prepare_data.py 4.1 KB

from . import shared
from .shared import save

import pandas as pd


def split_train_test(ratio=0.2, random_seed=42):
    """Split the raw data into train and test CSVs, adding a binary label column."""
    from sklearn.model_selection import train_test_split
    df = pd.read_csv(shared.raw_data, encoding='utf-8')
    # The label is whether the question is tagged 'machine-learning'
    df[shared.col_label] = df[shared.col_tags].fillna('').str.contains('machine-learning')
    train_df, test_df = train_test_split(df, test_size=ratio, random_state=random_seed,
                                         stratify=df[shared.col_label])
    train_df.to_csv(shared.train_data, index=False)
    test_df.to_csv(shared.test_data, index=False)


from html.parser import HTMLParser


class MLStripper(HTMLParser):
    """Collects only the text content of an HTML document, discarding the markup."""

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)


def text_preprocess(s):
    """Lowercase the text and strip any HTML markup."""
    stripper = MLStripper()
    stripper.feed(s.lower())
    return stripper.get_data()


def text_col(df):
    """Concatenate title and body into a single preprocessed text column."""
    return (df[shared.col_title].fillna('') + df[shared.col_body].fillna('')) \
        .astype('U', copy=False).apply(text_preprocess)


def text_len_col(text):
    """Return the text length as a 2D array, usable as a single numeric feature."""
    return text.str.len().values.reshape(-1, 1)


import re

token_pattern = re.compile(r"(?u)\b\w\w+\b")
# TODO: Better number pattern
number_pattern = re.compile(r"^\d+e?|e?\d+$")


def tokenizer(s):
    """
    Turns numeric tokens into a single <num> token,
    to prevent lots of redundant terms for different numbers.
    """
    tokens = token_pattern.findall(s)
    return ["<num>" if number_pattern.match(t) else t for t in tokens]


def build_pipeline():
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, FunctionTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import make_pipeline, make_union
    from sklearn.compose import ColumnTransformer
    from tutorial import prepare_data  # Required for proper pickling of this pipeline

    tfidf = TfidfVectorizer(encoding='utf-8', stop_words='english', analyzer='word',
                            max_features=25000, ngram_range=(1, 2),
                            tokenizer=prepare_data.tokenizer)
    union = make_union(FunctionTransformer(prepare_data.text_len_col), tfidf)
    return ColumnTransformer([
        ('passthrough', 'passthrough', [shared.col_id, shared.col_label]),
        ('num_cols', make_pipeline(SimpleImputer(), StandardScaler()), shared.extra_feature_cols),
        ('text', make_pipeline(FunctionTransformer(prepare_data.text_col), union), shared.text_cols),
    ])


def map_dataframe(df, pipeline):
    """Apply the fitted pipeline and rebuild a sparse dataframe with named columns."""
    tfidf_cols = [f'Text_{col}'
                  for col in pipeline.named_transformers_.text[1].transformer_list[1][1].get_feature_names()]
    cols = [shared.col_id, shared.col_label] + shared.extra_feature_cols + [shared.text_len_col] + tfidf_cols
    return pd.DataFrame.sparse.from_spmatrix(pipeline.transform(df).astype(float), columns=cols)


def prepare_data():
    pipeline = build_pipeline()

    print("Loading train data")
    train_df = pd.read_csv(shared.train_data, encoding='utf-8')
    print("Loading test data")
    test_df = pd.read_csv(shared.test_data, encoding='utf-8')
    print("Done")

    print("Fitting the pipeline...")
    pipeline.fit(train_df)
    print("Done")

    print("Transforming data")
    train_df = map_dataframe(train_df, pipeline)
    test_df = map_dataframe(test_df, pipeline)
    print("Done")

    save(pipeline, shared.pipeline_pkl)

    print("Saving training data")
    train_df.to_pickle(shared.train_processed)
    print("Saving test data")
    test_df.to_pickle(shared.test_processed)
    print("Done")

    # Log the pipeline's scalar hyperparameters to DAGsHub
    from dagshub import dagshub_logger
    with dagshub_logger(should_log_metrics=False, hparams_path='pipeline-params.yml') as logger:
        params = {k: v for k, v in pipeline.get_params().items()
                  if v is None or type(v) in [str, int, float, bool]}
        print('Logging pipeline params:')
        print(params)
        logger.log_hyperparams(params)


def main():
    print("Splitting train and test...")
    split_train_test()
    print("Done")
    print("Preparing data...")
    prepare_data()
    print("Done")


if __name__ == "__main__":
    main()
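
For context, `main()` runs the two stages in order: `split_train_test()` writes the train/test CSVs, and `prepare_data()` fits the feature pipeline, then saves the processed dataframes, the pickled pipeline, and pipeline-params.yml. A minimal usage sketch, assuming this file lives in a `tutorial` package alongside `shared.py` (as its relative imports and the `from tutorial import prepare_data` line suggest):

    # Hypothetical driver script, run from the repository root
    from tutorial import prepare_data

    prepare_data.split_train_test(ratio=0.2, random_seed=42)  # writes shared.train_data / shared.test_data
    prepare_data.prepare_data()  # fits the pipeline and saves the processed pickles

Running the module directly (`python -m tutorial.prepare_data`) does the same via `main()`.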