text_preprocessing.py

import os

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from config import *


def tokenize(file_path):
    """
    Tokenize the text of the file referred to by file_path.
    """
    with open(file_path, encoding='latin-1', mode='r') as f:
        text = f.read()
    text = text.lower()
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    tokens = tokenizer.tokenize(text)
    tokens = [w for w in tokens if len(w) > 2]  # ignore a, an, to, at, be, ...
    tokens = [w for w in tokens if w not in ENGLISH_STOP_WORDS]
    return tokens


print('[DEBUG] Started text preprocessing')
print('--------------------------------')

# load text from training data, excluding hidden files and directories
train_file_list = []
for root, dirs, files in os.walk(TRAIN_ROOT_PATH):
    files = [f for f in files if not f.startswith('.')]
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    for name in files:
        train_file_list.append(os.path.join(root, name))
print('[DEBUG] Obtained text files for training')

# feature engineering: slide a window of CONTEXT_WINDOW tokens over each file;
# the window is the context, the token that follows it is the prediction target
context_train = []
target_train = []
for file_path in train_file_list:
    tokens = tokenize(file_path)
    for i in range(len(tokens) - CONTEXT_WINDOW):
        context_train.append(tokens[i:i + CONTEXT_WINDOW])
        target_train.append(tokens[i + CONTEXT_WINDOW])

# create features and target for model training
df_train = pd.DataFrame(context_train, columns=[f'context_{i}' for i in range(CONTEXT_WINDOW)])
df_train['target'] = np.array(target_train)
print('[DEBUG] Done context-target split in text for training')

# load text from test data, excluding hidden files and directories
test_file_list = []
for root, dirs, files in os.walk(TEST_ROOT_PATH):
    files = [f for f in files if not f.startswith('.')]
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    for name in files:
        test_file_list.append(os.path.join(root, name))
print('[DEBUG] Obtained text files for testing')

# feature engineering: same sliding-window split as for the training data
context_test = []
target_test = []
for file_path in test_file_list:
    tokens = tokenize(file_path)
    for i in range(len(tokens) - CONTEXT_WINDOW):
        context_test.append(tokens[i:i + CONTEXT_WINDOW])
        target_test.append(tokens[i + CONTEXT_WINDOW])

df_test = pd.DataFrame(context_test, columns=[f'context_{i}' for i in range(CONTEXT_WINDOW)])
df_test['target'] = np.array(target_test)
print('[DEBUG] Done context-target split in text for testing')

# store training and test tabular data in csv files
df_train.to_csv(PROCESSED_TRAIN_PATH, index=False)
df_test.to_csv(PROCESSED_TEST_PATH, index=False)
print('[DEBUG] Stored training and test data of context vs target words in csv files')
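The script star-imports its paths and window size from config. A minimal sketch of what that module could look like; the constant names are the ones the script uses, but the values here are placeholders, not the original settings:

# config.py -- hypothetical values; only the names are dictated by the script
TRAIN_ROOT_PATH = 'data/raw/train'       # directory tree walked for training text files
TEST_ROOT_PATH = 'data/raw/test'         # directory tree walked for test text files
CONTEXT_WINDOW = 5                       # number of context tokens preceding each target
PROCESSED_TRAIN_PATH = 'data/processed/train.csv'
PROCESSED_TEST_PATH = 'data/processed/test.csv'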
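The effect of tokenize is easiest to see inline. This sketch applies the same regex and stop-word filtering to a made-up sentence instead of a file:

# demo of the filtering steps inside tokenize(); the sample sentence is invented
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

text = 'The model is trained to predict the next word from its context.'.lower()
tokens = RegexpTokenizer(r'[a-zA-Z]+').tokenize(text)  # letters-only tokens
tokens = [w for w in tokens if len(w) > 2]             # drop very short words
tokens = [w for w in tokens if w not in ENGLISH_STOP_WORDS]
print(tokens)  # expected: ['model', 'trained', 'predict', 'word', 'context']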

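As a quick sanity check of the stored output, the processed table can be read back with pandas. This snippet is not part of the original pipeline and only assumes the config names above:

# check_output.py -- hypothetical sanity check, not part of the original script
import pandas as pd
from config import PROCESSED_TRAIN_PATH, CONTEXT_WINDOW

df = pd.read_csv(PROCESSED_TRAIN_PATH)
# expect CONTEXT_WINDOW context columns followed by one target column
assert list(df.columns) == [f'context_{i}' for i in range(CONTEXT_WINDOW)] + ['target']
print(df.head())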