preprocess.sh 1.4 KB

123456789101112131415161718192021222324252627282930
  #!/bin/bash
  #
  # Preprocess the NER datasets found under datasets/NER/<entity> and write
  # the results to preprocessed_datasets/NER/<entity>.
  #
  # For every entity in ENTITIES and every split in SPLITS:
  #   1. <split>.tsv is copied to <split>.txt.tmp with tabs turned into spaces.
  #   2. SCRIPT_FILE is run on that .tmp file with the arguments MODEL_NAME and
  #      MAX_LENGTH; its stdout becomes <split>.txt.
  # Afterwards labels.txt is built from the unique, non-empty values of the
  # second space-separated field of train.txt, devel.txt and test.txt, and the
  # .tmp files are removed.
  #
  # All paths are relative: run this from the directory that contains both
  # datasets/ and named-entity-recognition/.

  # Stop at the first failing command (including inside pipelines) so that a
  # missing split or a preprocess.py error cannot silently yield empty or
  # truncated output files and a bogus labels.txt.
  set -euo pipefail

  readonly ENTITIES=(
    NCBI-disease
    BC5CDR-disease
    BC5CDR-chem
    BC4CHEMD
    JNLPBA
    BC2GM
    linnaeus
    s800
  )
  readonly SPLITS=(train devel train_dev test)
  readonly MODEL_NAME=bert-base-cased
  readonly MAX_LENGTH=128
  readonly SCRIPT_FILE=named-entity-recognition/scripts/preprocess.py

  for entity in "${ENTITIES[@]}"; do
    # The doubled spaces reproduce what the multi-argument echo used to print.
    printf '*****  %s  Preprocessing Start *****\n' "${entity}"

    data_dir="datasets/NER/${entity}"
    preprocessed_dir="preprocessed_datasets/NER/${entity}"
    mkdir -p -- "${preprocessed_dir}"

    # Replace tabs with spaces.
    for split in "${SPLITS[@]}"; do
      tr '\t' ' ' \
        < "${data_dir}/${split}.tsv" \
        > "${preprocessed_dir}/${split}.txt.tmp"
    done
    echo "Replacing Done"

    # Preprocess for BERT-based models.
    for split in "${SPLITS[@]}"; do
      python "${SCRIPT_FILE}" \
        "${preprocessed_dir}/${split}.txt.tmp" "${MODEL_NAME}" "${MAX_LENGTH}" \
        > "${preprocessed_dir}/${split}.txt"
    done

    # Collect the label set from train/devel/test (train_dev is not read here).
    # sed is used instead of `grep -v` because grep exits 1 when it selects
    # nothing, which would abort the script under pipefail.
    cat -- \
      "${preprocessed_dir}/train.txt" \
      "${preprocessed_dir}/devel.txt" \
      "${preprocessed_dir}/test.txt" \
      | cut -d ' ' -f 2 \
      | sed '/^$/d' \
      | sort -u \
      > "${preprocessed_dir}/labels.txt"

    echo "Removing .tmp files"
    # ${preprocessed_dir:?} guards against the glob ever expanding from "/".
    rm -f -- "${preprocessed_dir:?}"/*.tmp

    printf '*****  %s  Preprocessing Done *****\n' "${entity}"
  done
Tip!

Press p or ← to see the previous file, or n or → to see the next file.