preprocess.sh
#!/usr/bin/env bash
###########################################################
# Change the following values to preprocess a new dataset.
# TRAIN_DIR, VAL_DIR and TEST_DIR should be paths to
# directories that contain sub-directories; data will be
# extracted from the .py files found in those sub-dirs.
# DATASET_NAME is just a name for the currently extracted
# dataset.
# MAX_CONTEXTS is the number of contexts to keep for each
# method (200 by default).
# WORD_VOCAB_SIZE, PATH_VOCAB_SIZE, TARGET_VOCAB_SIZE -
# the number of words, paths and target words to keep in
# the vocabulary (the top-occurring words and paths are kept).
# The default values are reasonable for a Tesla K80 GPU
# or newer (12 GB of board memory).
# NUM_THREADS - the number of parallel threads to use. It is
# recommended to run the preprocessing step on a multi-core
# machine and to set this value to the number of cores.
# PYTHON - python3 interpreter alias.
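# JAVA - java executable alias (used to run the extractor JAR below).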
# cd2vec/java-small_raw/test/my_test
# cd2vec/python/my_test
TRAIN_DIR=cd2vec/python/my_train/
VAL_DIR=cd2vec/python/my_val/
TEST_DIR=cd2vec/python/my_test/
DATASET_NAME=my_dataset
MAX_CONTEXTS=200
WORD_VOCAB_SIZE=50000
PATH_VOCAB_SIZE=50000
TARGET_VOCAB_SIZE=3
NUM_THREADS=64
PYTHON=python3
JAVA=java
###########################################################
TRAIN_DATA_FILE=${DATASET_NAME}_train
VAL_DATA_FILE=${DATASET_NAME}_val
TEST_DATA_FILE=${DATASET_NAME}_test
EXTRACTOR_JAR=cd2vec/cli.jar
mkdir -p data/${DATASET_NAME}
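# Run the path extractor once per split. The flag semantics below are an
# assumption based on common path-extractor CLIs: --maxH/--maxW likely bound
# the height and width of the extracted AST paths, and --maxTokens/--maxPaths
# cap how many distinct tokens and paths the extractor keeps.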
echo "Extracting paths from validation set..."
${JAVA} -jar ${EXTRACTOR_JAR} code2vec --lang py --project ${VAL_DIR} --output ${VAL_DATA_FILE} \
  --maxH 8 --maxW 2 --maxContexts ${MAX_CONTEXTS} --maxTokens ${WORD_VOCAB_SIZE} --maxPaths ${PATH_VOCAB_SIZE}
echo "Finished extracting paths from validation set"
echo "Extracting paths from test set..."
${JAVA} -jar ${EXTRACTOR_JAR} code2vec --lang py --project ${TEST_DIR} --output ${TEST_DATA_FILE} \
  --maxH 8 --maxW 2 --maxContexts ${MAX_CONTEXTS} --maxTokens ${WORD_VOCAB_SIZE} --maxPaths ${PATH_VOCAB_SIZE}
echo "Finished extracting paths from test set"
echo "Extracting paths from training set..."
${JAVA} -Xmx8g -jar ${EXTRACTOR_JAR} code2vec --lang py --project ${TRAIN_DIR} --output ${TRAIN_DATA_FILE} \
  --maxH 8 --maxW 2 --maxContexts ${MAX_CONTEXTS} --maxTokens ${WORD_VOCAB_SIZE} --maxPaths ${PATH_VOCAB_SIZE}
echo "Finished extracting paths from training set"
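# The extractor writes each split as a directory of path_*csv shards
# (as the globs below assume); merge them into one combined.csv per split.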
cat ${VAL_DATA_FILE}/path_*csv > ${VAL_DATA_FILE}/combined.csv
cat ${TEST_DATA_FILE}/path_*csv > ${TEST_DATA_FILE}/combined.csv
cat ${TRAIN_DATA_FILE}/path_*csv > ${TRAIN_DATA_FILE}/combined.csv
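# From here on, the *_DATA_FILE variables point at the merged CSVs rather
# than at the extractor output directories.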
VAL_DATA_FILE=${VAL_DATA_FILE}/combined.csv
TEST_DATA_FILE=${TEST_DATA_FILE}/combined.csv
TRAIN_DATA_FILE=${TRAIN_DATA_FILE}/combined.csv
TARGET_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.tgt.c2v
ORIGIN_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.ori.c2v
PATH_HISTOGRAM_FILE=data/${DATASET_NAME}/${DATASET_NAME}.histo.path.c2v
echo "Creating histograms from the training data"
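# Each extracted line has the form "<target> <ctx> <ctx> ...", where every
# context is a "token,path,token" triple (the code2vec context format).
# Field 1 thus yields target names, fields 1 and 3 of each context yield
# terminal tokens, and field 2 yields paths; awk counts occurrences to
# build the frequency histograms.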
cat ${TRAIN_DATA_FILE} | cut -d' ' -f1 | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${TARGET_HISTOGRAM_FILE}
cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f1,3 | tr ',' '\n' | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${ORIGIN_HISTOGRAM_FILE}
cat ${TRAIN_DATA_FILE} | cut -d' ' -f2- | tr ' ' '\n' | cut -d',' -f2 | awk '{n[$0]++} END {for (i in n) print i,n[i]}' > ${PATH_HISTOGRAM_FILE}
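# preprocess.py (code2vec's preprocessing script) uses these histograms to
# pick the top-occurring words, paths and targets for the vocabularies and
# writes the final train/val/test files under data/${DATASET_NAME}/.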
${PYTHON} preprocess.py --train_data ${TRAIN_DATA_FILE} --test_data ${TEST_DATA_FILE} --val_data ${VAL_DATA_FILE} \
  --max_contexts ${MAX_CONTEXTS} --word_vocab_size ${WORD_VOCAB_SIZE} --path_vocab_size ${PATH_VOCAB_SIZE} \
  --target_vocab_size ${TARGET_VOCAB_SIZE} --word_histogram ${ORIGIN_HISTOGRAM_FILE} \
  --path_histogram ${PATH_HISTOGRAM_FILE} --target_histogram ${TARGET_HISTOGRAM_FILE} --output_name data/${DATASET_NAME}/${DATASET_NAME}
# If all went well, the histogram files can be deleted, because preprocess.py
# creates new files with a truncated and padded number of paths for each example.
rm ${TARGET_HISTOGRAM_FILE} ${ORIGIN_HISTOGRAM_FILE} ${PATH_HISTOGRAM_FILE}