prepare_data.py

from . import shared
from .shared import save
import pandas as pd
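
# NOTE (added for context, not part of the original file): `shared` is assumed
# to define the paths and column names used throughout this script, roughly:
#
#   raw_data, train_data, test_data    -- CSV paths
#   train_processed, test_processed    -- pickle paths for the transformed frames
#   pipeline_pkl                       -- pickle path for the fitted pipeline
#   col_id, col_label, col_tags, col_title, col_body
#   extra_feature_cols, text_cols      -- lists of column names
#   save(obj, path)                    -- pickling helper
#
# The exact definitions live in shared.py and may differ.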


def split_train_test(ratio=0.2, random_seed=42):
    from sklearn.model_selection import train_test_split
    df = pd.read_csv(shared.raw_data, encoding='utf-8')
    # Label a row as positive if its tags contain 'machine-learning'
    df[shared.col_label] = df[shared.col_tags].fillna('').str.contains('machine-learning')
    df_positive = df[df[shared.col_label]]   # computed for inspection; not used below
    df_negative = df[~df[shared.col_label]]
    # Stratify on the label so train and test keep the same positive/negative ratio
    train_df, test_df = train_test_split(df, test_size=ratio, random_state=random_seed, stratify=df[shared.col_label])
    train_df.to_csv(shared.train_data, index=False)
    test_df.to_csv(shared.test_data, index=False)


from html.parser import HTMLParser


class MLStripper(HTMLParser):
    """Collects only the text content of an HTML document, stripping all tags."""
    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)


def text_preprocess(s):
    # Lowercase, then strip HTML tags, keeping only the text content
    strip = MLStripper()
    strip.feed(s.lower())
    return strip.get_data()


def text_col(df):
    # Concatenate title and body into a single text series for tf-idf
    return (df[shared.col_title].fillna('') + df[shared.col_body].fillna('')).astype('U', copy=False).apply(text_preprocess)
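
# Illustrative example (added; output is roughly what HTMLParser produces,
# since convert_charrefs=True also decodes entities like &amp;):
#   text_preprocess("<p>Hello <b>World</b> &amp; ML</p>")
#   -> "hello world & ml"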


import re

token_pattern = re.compile(r"(?u)\b\w\w+\b")
# TODO: Better number pattern
number_pattern = re.compile(r"^\d+e?|e?\d+$")


def tokenizer(s):
    """
    Turns numeric tokens into a single <num> token, to prevent lots of redundant terms for different numbers
    """
    tokens = token_pattern.findall(s)
    return ["<num>" if number_pattern.match(t) else t for t in tokens]
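
# Illustrative example (added): token_pattern keeps only tokens of 2+ word
# characters, and purely numeric tokens collapse into one shared placeholder:
#   tokenizer("python has 42 features and 1000 users")
#   -> ['python', 'has', '<num>', 'features', 'and', '<num>', 'users']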


def build_pipeline():
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, FunctionTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import make_pipeline
    from sklearn.compose import ColumnTransformer
    import tutorial.prepare_data
    tfidf = TfidfVectorizer(encoding='utf-8', stop_words='english', analyzer='word',
                            max_features=25000, ngram_range=(1, 2),
                            tokenizer=tutorial.prepare_data.tokenizer)
    from tutorial import prepare_data  # Required for proper pickling of this pipeline
    return ColumnTransformer([
        ('passthrough', 'passthrough', [shared.col_id, shared.col_label]),
        ('num_cols', make_pipeline(SimpleImputer(), StandardScaler()), shared.extra_feature_cols),
        ('tfidf', make_pipeline(FunctionTransformer(prepare_data.text_col), tfidf), shared.text_cols),
    ])
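
# Design note (added): the ColumnTransformer passes the id and label columns
# through untouched, imputes and standardizes the numeric feature columns,
# and turns the concatenated title+body text into tf-idf features.
# A rough usage sketch, assuming train_df has all the columns named in shared:
#   pipeline = build_pipeline()
#   X = pipeline.fit_transform(train_df)   # scipy sparse matrix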


def map_dataframe(df, pipeline):
    # Rebuild readable column names for the transformed matrix.
    # Note: get_feature_names() was renamed get_feature_names_out() in newer scikit-learn.
    tfidf_cols = [f'Text_{col}' for col in pipeline.named_transformers_.tfidf[1].get_feature_names()]
    cols = [shared.col_id, shared.col_label] + shared.extra_feature_cols + tfidf_cols
    return pd.DataFrame.sparse.from_spmatrix(pipeline.transform(df), columns=cols)
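
# Note (added): pd.DataFrame.sparse.from_spmatrix keeps the result in sparse
# form; with up to 25,000 tf-idf columns (including bigrams), a dense
# DataFrame of this width would be far more memory-hungry.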


def prepare_data():
    pipeline = build_pipeline()
    print("Loading train data")
    train_df = pd.read_csv(shared.train_data, encoding='utf-8')
    print("Loading test data")
    test_df = pd.read_csv(shared.test_data, encoding='utf-8')
    print("Done")

    print("Fitting the pipeline...")
    pipeline.fit(train_df)
    print("Done")

    print("Transforming data")
    train_df = map_dataframe(train_df, pipeline)
    test_df = map_dataframe(test_df, pipeline)
    print("Done")

    save(pipeline, shared.pipeline_pkl)
    print("Saving training data")
    train_df.to_pickle(shared.train_processed)
    print("Saving test data")
    test_df.to_pickle(shared.test_processed)
    print("Done")

    from dagshub import dagshub_logger
    with dagshub_logger(should_log_metrics=False, hparams_path='pipeline-params.yml') as logger:
        # Log only primitive-valued hyperparameters, which serialize cleanly
        params = {k: v for k, v in pipeline.get_params().items()
                  if v is None or type(v) in [str, int, float, bool]}
        print('Logging pipeline params:')
        print(params)
        logger.log_hyperparams(params)


def main():
    print("Splitting train and test...")
    split_train_test()
    print("Done")
    print("Preparing data...")
    prepare_data()
    print("Done")


if __name__ == "__main__":
    main()
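
# Usage (added; assumes this file lives in a `tutorial` package, as the
# relative imports above suggest):
#   python -m tutorial.prepare_data
# This splits raw_data into train/test CSVs, fits the pipeline on the train
# split, pickles the pipeline and both transformed frames, and logs the
# pipeline hyperparameters to pipeline-params.yml via dagshub_logger.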