preprocess.py 993 B

1234567891011121314151617181920212223242526272829303132333435363738394041
  1. import sys
  2. from transformers import AutoTokenizer
  3. dataset = sys.argv[1]
  4. model_name_or_path = sys.argv[2]
  5. max_len = int(sys.argv[3])
  6. subword_len_counter = 0
  7. tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
  8. max_len -= tokenizer.num_special_tokens_to_add()
  9. with open(dataset, "rt") as f_p:
  10. for line in f_p:
  11. line = line.rstrip()
  12. if not line:
  13. print(line)
  14. subword_len_counter = 0
  15. continue
  16. token = line.split()[0]
  17. current_subwords_len = len(tokenizer.tokenize(token))
  18. # Token contains strange control characters like \x96 or \x95
  19. # Just filter out the complete line
  20. if current_subwords_len == 0:
  21. continue
  22. if (subword_len_counter + current_subwords_len) > max_len:
  23. print("")
  24. print(line)
  25. subword_len_counter = current_subwords_len
  26. continue
  27. subword_len_counter += current_subwords_len
  28. print(line)
Tip!

Press p to see the previous file, or n to see the next file.