1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
- from . import shared
- from .shared import save
def split_train_test(ratio=0.2, random_seed=42):
    """Label the raw data and write a stratified train/test split to disk.

    Reads the CSV at ``shared.raw_data``, adds a boolean column
    ``shared.col_label`` that is True when the ``shared.col_tags`` value
    contains the substring ``machine-learning``, then writes the two
    partitions to ``shared.train_data`` and ``shared.test_data``.

    Args:
        ratio: Fraction of rows assigned to the test partition.
        random_seed: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
    """
    import pandas as pd
    from sklearn.model_selection import train_test_split

    df = pd.read_csv(shared.raw_data, encoding='utf-8')

    # Rows without tags are treated as negatives. The tag is a literal
    # substring, so regex matching is switched off explicitly.
    tags = df[shared.col_tags].fillna('')
    df[shared.col_label] = tags.str.contains('machine-learning', regex=False)

    # Stratify on the label so both partitions keep the same class balance.
    train_df, test_df = train_test_split(
        df,
        test_size=ratio,
        random_state=random_seed,
        stratify=df[shared.col_label],
    )
    train_df.to_csv(shared.train_data, index=False)
    test_df.to_csv(shared.test_data, index=False)
- from html.parser import HTMLParser
class MLStripper(HTMLParser):
    """HTML parser that discards markup and collects only the text content.

    Feed markup with ``feed()`` and read the accumulated plain text with
    ``get_data()``. Character references (e.g. ``&amp;``) are converted to
    their characters because ``convert_charrefs`` is enabled.
    """

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no explicit reset here.
        super().__init__(convert_charrefs=True)
        # Legacy attribute retained so existing readers of `.strict` still work.
        self.strict = False
        # Text fragments in document order.
        self.fed = []

    def handle_data(self, d):
        """Record one run of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all text seen so far as a single string.

        With ``convert_charrefs`` enabled the parser may hold back trailing
        text that contains an unterminated ``&`` (it could be a character
        reference cut in half). ``close()`` forces that buffered tail through
        ``handle_data`` so it is not silently lost. Calling it again on an
        already-flushed parser is a no-op.
        """
        self.close()
        return ''.join(self.fed)
def text_preprocessing(s):
    """Lower-case *s* and strip its HTML markup, returning the plain text.

    Args:
        s: Raw document text, possibly containing HTML tags.

    Returns:
        The lower-cased text content with all tags removed.
    """
    stripper = MLStripper()
    stripper.feed(s.lower())
    # feed() may keep a trailing fragment buffered (e.g. text ending in an
    # unterminated '&'); close() flushes it so no input text is dropped.
    stripper.close()
    return stripper.get_data()
def vectorize_text():
    """Build TF-IDF matrices for the train and test partitions.

    Reads ``shared.train_data`` and ``shared.test_data``, adds a combined
    text column (``shared.col_text``) made of title and body to each frame,
    fits a ``TfidfVectorizer`` on the training text only, and transforms both
    partitions with it.

    Returns:
        A tuple ``(train_df, test_df, vectorizer, train_tfidf_mat,
        test_tfidf_mat)``.
    """
    import pandas as pd
    from sklearn.feature_extraction.text import TfidfVectorizer

    train_df = pd.read_csv(shared.train_data, encoding='utf-8')
    test_df = pd.read_csv(shared.test_data, encoding='utf-8')

    def text_col(df):
        """Add the combined title + body text column to *df* in place."""
        title = df[shared.col_title].fillna('')
        body = df[shared.col_body].fillna('')
        # Join with a space so the last title word and the first body word
        # do not fuse into a single bogus token.
        df[shared.col_text] = (title + ' ' + body).astype('U', copy=False)

    text_col(train_df)
    text_col(test_df)

    vectorizer = TfidfVectorizer(
        encoding='utf-8',
        preprocessor=text_preprocessing,
        stop_words='english',
        analyzer='word',
        max_features=50000,
    )
    # fit_transform learns the vocabulary and produces the training matrix in
    # one pass instead of tokenizing the training text twice.
    train_tfidf_mat = vectorizer.fit_transform(train_df[shared.col_text])
    # The test partition is only transformed, never fitted on.
    test_tfidf_mat = vectorizer.transform(test_df[shared.col_text])
    return train_df, test_df, vectorizer, train_tfidf_mat, test_tfidf_mat
def prepare_data():
    """Vectorize the text and persist every resulting artifact.

    Saves the fitted vectorizer, writes the id/text columns of both
    partitions as CSV, and saves the two TF-IDF matrices, all to the paths
    configured on ``shared``.
    """
    train_frame, test_frame, tfidf, train_matrix, test_matrix = vectorize_text()

    save(tfidf, shared.vectorizer_pkl)

    # Only the identifier and the combined text are kept in the processed CSVs.
    kept_columns = [shared.col_id, shared.col_text]
    csv_targets = (
        (train_frame, shared.train_processed),
        (test_frame, shared.test_processed),
    )
    for frame, destination in csv_targets:
        frame[kept_columns].to_csv(destination, index=False)

    matrix_targets = (
        (train_matrix, shared.train_tfidf),
        (test_matrix, shared.test_tfidf),
    )
    for matrix, destination in matrix_targets:
        save(matrix, destination)
def main():
    """Run the pipeline: split the raw data, then build and save features."""
    # Order matters: prepare_data() reads the files split_train_test() writes.
    for stage in (split_train_test, prepare_data):
        stage()
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
-
|