-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathgenerate_verl_data.sh
More file actions
executable file
·161 lines (140 loc) · 5.8 KB
/
generate_verl_data.sh
File metadata and controls
executable file
·161 lines (140 loc) · 5.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
### Positional arguments (all optional; defaults below) ----------------------
MODEL=${1:-"Qwen/Qwen3-8B"} # Huggingface model (used for embedding)
DATASET=${2:-"lasgroup/verifiable-corpus"} # Huggingface dataset (training dataset)
BENCHMARK=${3:-"math-ai/aime25"} # Huggingface dataset (target benchmark)
# FIX: was ${4="..."} — the assignment form is invalid for positional
# parameters ("cannot assign in this way") and aborts the script when $4 is
# unset; use the standard default-value form like the other arguments.
DATA_LOCATION=${4:-"/users/$USER/data"} # Where to store the dataset files
PRESELECTION=${5:-"false"} # Alternative mean,SIFT, ... (nearest_neighbor)
PRESELECTION_SIZE=${6:-500000} # Number of most similar target questions for dataset
OVERWRITE=${7:-"false"} # Whether to overwrite previously created datasets
DATASET_CATEGORY=${8:-"false"} # Subset specification for training benchmark
BENCHMARK_CATEGORY=${9:-"false"} # Subset specification for target benchmark
EXEC=${10:-"false"} # Whether to execute dataset creation or just print commands
TRAINING_SET_LOCATION=${11:-""} # Set if training set should be copied from existing location
### Derive file-system-safe names (slashes in HF ids become underscores).
# A category suffix is appended only when a real category was requested.
BENCHMARK_NAME="${BENCHMARK//\//_}"
if [[ "$BENCHMARK_CATEGORY" != "false" ]]; then
  BENCHMARK_NAME="${BENCHMARK_NAME}_${BENCHMARK_CATEGORY}"
fi
DS_NAME="${DATASET//\//_}_${BENCHMARK_NAME}_${PRESELECTION_SIZE}"
if [[ "$DATASET_CATEGORY" != "false" ]]; then
  DS_NAME="${DS_NAME}_${DATASET_CATEGORY}"
fi
# Field holding the question text in each dataset.
DATASET_QUESTION_KEY="description"
BENCHMARK_QUESTION_KEY="description"
# Accumulates the generated python commands; printed and optionally eval'd at the end.
CMD=""
### Load Benchmark dataset ---------------------------------------------------
# Download the raw benchmark once into $DATA_LOCATION/benchmark_datasets,
# unless it is already cached (or OVERWRITE=true forces a refresh).
BENCHMARK_DIR="$DATA_LOCATION/benchmark_datasets"
BENCHMARK_PATH="$BENCHMARK_DIR/${BENCHMARK_NAME}.json"
mkdir -p "$BENCHMARK_DIR"  # quoted: path may contain spaces
if [[ "$OVERWRITE" == "true" || ! -f "$BENCHMARK_PATH" ]]; then
# NOTE(review): ${PYTHONPATH} is used here as a *directory prefix* for the
# repo's scripts, not as the usual module search path — presumably it must
# end with a trailing slash when set; confirm against the caller.
CMD+="python ${PYTHONPATH}data/load_dataset.py \
--dataset_name $BENCHMARK \
--output_path $BENCHMARK_PATH \
--start_idx 0 \
--category $BENCHMARK_CATEGORY;"
else
echo "Benchmark dataset already exists at $BENCHMARK_PATH."
fi
### Create embeddings --------------------------------------------------------
# One cached .npy per corpus, stored under a directory keyed by the model id.
EMBEDDING_DIR="${DATA_LOCATION}/embeddings_${MODEL//\//_}"
mkdir -p "$EMBEDDING_DIR"
DATASET_EMBEDDING_PATH="$EMBEDDING_DIR/${DATASET//\//_}.npy"
BENCHMARK_EMBEDDING_PATH="$EMBEDDING_DIR/${BENCHMARK_NAME}.npy"
# FIX: honour $OVERWRITE here too, for consistency with every other cache
# check in this script (previously only a missing file triggered
# recomputation of the dataset embeddings).
if [[ "$OVERWRITE" == "true" || ! -f "$DATASET_EMBEDDING_PATH" ]]; then
CMD+="python ${PYTHONPATH}data/compute_embeddings.py \
--dataset_name $DATASET \
--category $DATASET_CATEGORY \
--output_path $DATASET_EMBEDDING_PATH \
--question_key $DATASET_QUESTION_KEY \
--start_data_index 0 \
--model_id $MODEL \
--batch_size 8;"
else
# FIX: dropped a leftover debug echo that dumped the full python command
# here; the other cache-hit branches only report the existing path.
echo "Embeddings already exists at $DATASET_EMBEDDING_PATH."
fi
if [[ "$OVERWRITE" == "true" || ! -f "$BENCHMARK_EMBEDDING_PATH" ]]; then
CMD+="python ${PYTHONPATH}data/compute_embeddings.py \
--dataset_name $BENCHMARK \
--output_path $BENCHMARK_EMBEDDING_PATH \
--question_key $BENCHMARK_QUESTION_KEY \
--model_id $MODEL \
--start_data_index 0 \
--batch_size 8;"
else
echo "Embeddings already exists at $BENCHMARK_EMBEDDING_PATH."
fi
### Reload dataset with embeddings -------------------------------------------
# Re-export the benchmark as the verl test split, with embeddings attached.
TTRL_DATASET_DIR="$DATA_LOCATION/verl_data/$DS_NAME"
TTRL_DATASET_TRAIN_PATH="$TTRL_DATASET_DIR/train.json"
TTRL_DATASET_TEST_PATH="$TTRL_DATASET_DIR/test.json"
mkdir -p "$TTRL_DATASET_DIR"
mkdir -p "${PYTHONPATH}TTRL/verl/data"
# NOTE(review): the link target directory already exists (created just
# above), so ln places the symlink *inside* it as .../data/verl_data —
# confirm that nesting is intended rather than linking data/ itself.
ln -sf "$DATA_LOCATION/verl_data" "${PYTHONPATH}TTRL/verl/data"
# FIX: gate on the file this step actually produces (test.json), not on the
# raw benchmark dump — previously the test split was silently skipped
# whenever the raw benchmark existed but test.json did not.
if [[ "$OVERWRITE" == "true" || ! -f "$TTRL_DATASET_TEST_PATH" ]]; then
CMD+="python ${PYTHONPATH}data/load_dataset.py \
--dataset_name $BENCHMARK \
--output_path $TTRL_DATASET_TEST_PATH \
--start_idx 0 \
--category $BENCHMARK_CATEGORY \
--embeddings_file $BENCHMARK_EMBEDDING_PATH;"
else
echo "Benchmark test split already exists at $TTRL_DATASET_TEST_PATH."
fi
### Build the training split -------------------------------------------------
# Either compute similarity-sorted training data from scratch, or copy a
# previously built training set from $TRAINING_SET_LOCATION.
if [[ "$TRAINING_SET_LOCATION" == "" ]]; then
### Create similarities
SIMILARITIES_DIR="${DATA_LOCATION}/similarities_${MODEL//\//_}"
mkdir -p "$SIMILARITIES_DIR"
SIMILARITIES_PATH="$SIMILARITIES_DIR/${DATASET//\//_}_${BENCHMARK_NAME}.npy"
if [[ "$OVERWRITE" == "true" || ! -f "$SIMILARITIES_PATH" ]]; then
CMD+="python ${PYTHONPATH}data/compute_neighborhoods.py \
--embeddings $BENCHMARK_EMBEDDING_PATH \
--embeddings_2 $DATASET_EMBEDDING_PATH \
--method $PRESELECTION \
--output $SIMILARITIES_PATH;"
else
echo "Similarities already exists at $SIMILARITIES_PATH."
fi
### Sort dataset
DATASET_DIR="${DATA_LOCATION}/datasets_${MODEL//\//_}"
mkdir -p "$DATASET_DIR"
# FIX: was testing the never-set $DATASET_PATH, so this cache check could
# never skip the step; gate on the train file the step actually writes.
if [[ "$OVERWRITE" == "true" || ! -f "$TTRL_DATASET_TRAIN_PATH" ]]; then
CMD+="python ${PYTHONPATH}data/sort_dataset.py \
--dataset_name $DATASET \
--output_path $TTRL_DATASET_TRAIN_PATH \
--similarities_file_name $SIMILARITIES_PATH \
--start_idx 0 \
--num_el $PRESELECTION_SIZE \
--category $DATASET_CATEGORY \
--embeddings_file $DATASET_EMBEDDING_PATH;"
else
echo "Sorted dataset already exists at $TTRL_DATASET_TRAIN_PATH."
fi
### Preprocess train and test data
# FIX: the missing line-continuation backslash after --data_source left a
# raw newline inside CMD, so eval ran '--test_only False;' as a separate
# (nonexistent) command.
CMD+="python ${PYTHONPATH}data/preprocess.py \
--data_source $TTRL_DATASET_DIR \
--test_only False;"
else
mkdir -p "$TTRL_DATASET_DIR"
### Copy Training Set
cp "$TRAINING_SET_LOCATION/train.json" "$TTRL_DATASET_DIR/train.json"
cp "$TRAINING_SET_LOCATION/train.parquet" "$TTRL_DATASET_DIR/train.parquet"
### Preprocess train and test data (same continuation fix as above)
CMD+="python ${PYTHONPATH}data/preprocess.py \
--data_source $TTRL_DATASET_DIR \
--test_only True;"
fi
### Run command if execute is true -------------------------------------------
# Always print the assembled command; evaluate it only when EXEC=true.
# FIX: quote "$CMD" — the unquoted expansion word-splits the command string
# and collapses its whitespace in the printed output.
echo -e "$CMD"
if [[ "$EXEC" == "true" ]]; then
eval "$CMD"
fi