1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
|
- from . import shared
- from .shared import save
- import pandas as pd
def split_train_test(ratio=0.2, random_seed=42):
    """Create a stratified train/test split of the raw dataset.

    Reads the raw CSV pointed to by ``shared.raw_data``, derives the binary
    label column (whether the post's tags contain 'machine-learning'), and
    writes the two splits to ``shared.train_data`` / ``shared.test_data``.

    Args:
        ratio: Fraction of rows held out for the test set.
        random_seed: Seed forwarded to scikit-learn for reproducibility.
    """
    from sklearn.model_selection import train_test_split
    df = pd.read_csv(shared.raw_data, encoding='utf-8')
    # Label is True when the (possibly missing) tag string mentions the topic.
    df[shared.col_label] = df[shared.col_tags].fillna('').str.contains('machine-learning')
    # Stratify on the label so both splits keep the same class balance.
    # (The original also built unused df_positive/df_negative frames; removed as dead code.)
    train_df, test_df = train_test_split(df, test_size=ratio, random_state=random_seed,
                                         stratify=df[shared.col_label])
    train_df.to_csv(shared.train_data, index=False)
    test_df.to_csv(shared.test_data, index=False)
- from html.parser import HTMLParser
class MLStripper(HTMLParser):
    """Strip tags from HTML: feed() it markup, then read the plain text back."""

    def __init__(self):
        super().__init__()
        self.reset()
        self.convert_charrefs = True  # decode entities like &amp; automatically
        self.strict = False
        self.fed = []  # text fragments, collected in document order

    def handle_data(self, data):
        """Called by HTMLParser for each run of text between tags."""
        self.fed.append(data)

    def get_data(self):
        """Return all text seen so far, joined into one string."""
        return ''.join(self.fed)
def text_preprocess(s):
    """Lowercase *s* and remove any HTML markup, returning plain text."""
    stripper = MLStripper()
    stripper.feed(s.lower())
    return stripper.get_data()
def text_col(df):
    """Combine the title and body columns into one cleaned text Series."""
    combined = df[shared.col_title].fillna('') + df[shared.col_body].fillna('')
    # Force a unicode dtype before applying the HTML-stripping preprocessor.
    return combined.astype('U', copy=False).apply(text_preprocess)
import re

# Words of two or more word characters (same default pattern scikit-learn uses).
token_pattern = re.compile(r"(?u)\b\w\w+\b")
# TODO: Better number pattern
number_pattern = re.compile(r"^\d+e?|e?\d+$")


def tokenizer(s):
    """
    Turns numeric tokens into a single <num> token, to prevent lots of redundant terms for different numbers
    """
    return ["<num>" if number_pattern.match(token) else token
            for token in token_pattern.findall(s)]
-
def build_pipeline():
    """Build the (unfitted) feature-extraction ColumnTransformer.

    Produces three groups of output columns:
      * passthrough - the ID and label columns, untouched.
      * num_cols    - numeric extra features, imputed then standardized.
      * tfidf       - TF-IDF features over the combined title+body text.
    """
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, FunctionTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import make_pipeline
    from sklearn.compose import ColumnTransformer
    # Reference tokenizer/text_col through the module object (not local names)
    # so the fitted pipeline pickles and unpickles correctly.
    # (Was imported twice, as `import tutorial.prepare_data` and
    # `from tutorial import prepare_data`; one import suffices.)
    from tutorial import prepare_data
    tfidf = TfidfVectorizer(encoding='utf-8', stop_words='english', analyzer='word',
                            max_features=25000, ngram_range=(1, 2),
                            tokenizer=prepare_data.tokenizer)
    return ColumnTransformer([
        ('passthrough', 'passthrough', [shared.col_id, shared.col_label]),
        ('num_cols', make_pipeline(SimpleImputer(), StandardScaler()), shared.extra_feature_cols),
        ('tfidf', make_pipeline(FunctionTransformer(prepare_data.text_col), tfidf), shared.text_cols)
    ])
def map_dataframe(df, pipeline):
    """Transform *df* through the fitted *pipeline* and restore column names.

    Returns a sparse-backed DataFrame whose columns are: the ID and label
    passthrough columns, the numeric extra-feature columns, and one
    'Text_<term>' column per TF-IDF vocabulary term.
    """
    tfidf = pipeline.named_transformers_.tfidf[1]
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer the new API but keep a fallback for older versions.
    if hasattr(tfidf, 'get_feature_names_out'):
        feature_names = tfidf.get_feature_names_out()
    else:
        feature_names = tfidf.get_feature_names()
    tfidf_cols = [f'Text_{col}' for col in feature_names]
    cols = [shared.col_id, shared.col_label] + shared.extra_feature_cols + tfidf_cols
    return pd.DataFrame.sparse.from_spmatrix(pipeline.transform(df), columns=cols)
def prepare_data():
    """Fit the feature pipeline on the train split, transform both splits,
    and persist the pipeline, the processed data, and the pipeline params."""
    pipeline = build_pipeline()
    print("Loading train data")
    df_train = pd.read_csv(shared.train_data, encoding='utf-8')
    print("Loading test data")
    df_test = pd.read_csv(shared.test_data, encoding='utf-8')
    print("Done")

    print("Fitting the pipeline...")
    pipeline.fit(df_train)
    print("Done")
    print("Transforming data")
    df_train = map_dataframe(df_train, pipeline)
    df_test = map_dataframe(df_test, pipeline)
    print("Done")
    save(pipeline, shared.pipeline_pkl)
    print("Saving training data")
    df_train.to_pickle(shared.train_processed)
    print("Saving test data")
    df_test.to_pickle(shared.test_processed)
    print("Done")
    from dagshub import dagshub_logger
    with dagshub_logger(should_log_metrics=False, hparams_path='pipeline-params.yml') as logger:
        # Keep only simple scalar params; estimator objects aren't loggable.
        hparams = {name: value for name, value in pipeline.get_params().items()
                   if value is None or type(value) in (str, int, float, bool)}
        print('Logging pipeline params:')
        print(hparams)
        logger.log_hyperparams(hparams)
def main():
    """Run the full data-preparation stage: split, then featurize."""
    for description, step in (("Splitting train and test...", split_train_test),
                              ("Preparing data...", prepare_data)):
        print(description)
        step()
        print("Done")


if __name__ == "__main__":
    main()
-
|