#!/usr/bin/env bash
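# Optional hardening (not in the original script): abort on errors, on unset
# variables, and on failures anywhere inside a pipeline.
set -euo pipefail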
###########################################################
DATASET_NAME=testing
INPUT_DIR=raw_code
OUTPUT_DIR=output/${DATASET_NAME}
MAX_CONTEXTS=200
WORD_VOCAB_SIZE=1301136
PATH_VOCAB_SIZE=911417
TARGET_VOCAB_SIZE=261245
NUM_THREADS=4
PYTHON=python3
TRAIN_DATA_FILE=${OUTPUT_DIR}/train.raw.txt
VAL_DATA_FILE=${OUTPUT_DIR}/validation.raw.txt
TEST_DATA_FILE=${OUTPUT_DIR}/test.raw.txt
TARGET_HISTOGRAM_FILE=${OUTPUT_DIR}/${DATASET_NAME}.histo.tgt.c2v
ORIGIN_HISTOGRAM_FILE=${OUTPUT_DIR}/${DATASET_NAME}.histo.ori.c2v
PATH_HISTOGRAM_FILE=${OUTPUT_DIR}/${DATASET_NAME}.histo.path.c2v
###########################################################
mkdir -p "${OUTPUT_DIR}"
echo "======== Building extractor... ==========="
npm run build
echo "======== Extracting JS files from: ${INPUT_DIR} ========"
node build/index.js -i "${INPUT_DIR}" -o "${OUTPUT_DIR}"
echo ""
echo "======== Creating histograms from the training data ========"
# histogram of the labels
cat "${TRAIN_DATA_FILE}" | cut -d' ' -f1 | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > "${TARGET_HISTOGRAM_FILE}"
# histogram of all source/target terminal words (fields 1 and 3 of each context)
cat "${TRAIN_DATA_FILE}" | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > "${ORIGIN_HISTOGRAM_FILE}"
# histogram of all the path hashes (field 2 of each context)
cat "${TRAIN_DATA_FILE}" | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > "${PATH_HISTOGRAM_FILE}"
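# Optional: report histogram sizes so the *_VOCAB_SIZE caps above can be
# compared against the actual number of distinct labels/words/paths
# (a quick sanity sketch; preprocess.py does not require it).
wc -l "${TARGET_HISTOGRAM_FILE}" "${ORIGIN_HISTOGRAM_FILE}" "${PATH_HISTOGRAM_FILE}"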
echo ""
echo "======== Preprocessing histogram files and raw text files ========"
${PYTHON} preprocess.py --train_data "${TRAIN_DATA_FILE}" --test_data "${TEST_DATA_FILE}" --val_data "${VAL_DATA_FILE}" \
  --max_contexts "${MAX_CONTEXTS}" --word_vocab_size "${WORD_VOCAB_SIZE}" --path_vocab_size "${PATH_VOCAB_SIZE}" \
  --target_vocab_size "${TARGET_VOCAB_SIZE}" --word_histogram "${ORIGIN_HISTOGRAM_FILE}" \
  --path_histogram "${PATH_HISTOGRAM_FILE}" --target_histogram "${TARGET_HISTOGRAM_FILE}" --output_name "${OUTPUT_DIR}/${DATASET_NAME}"
# If all went well, the raw data files can be deleted, because preprocess.py
# creates new files with a truncated and padded number of paths per example.
# rm "${TRAIN_DATA_FILE}" "${VAL_DATA_FILE}" "${TEST_DATA_FILE}" \
#    "${TARGET_HISTOGRAM_FILE}" "${ORIGIN_HISTOGRAM_FILE}" "${PATH_HISTOGRAM_FILE}"