prepare_data.py 4.1 KB

from . import shared
from .shared import save

import pandas as pd


def split_train_test(ratio=0.2, random_seed=42):
    """Split the raw data into train and test CSVs, adding a binary label column."""
    from sklearn.model_selection import train_test_split
    df = pd.read_csv(shared.raw_data, encoding='utf-8')
    # The label is whether the question is tagged 'machine-learning'
    df[shared.col_label] = df[shared.col_tags].fillna('').str.contains('machine-learning')
    train_df, test_df = train_test_split(df, test_size=ratio, random_state=random_seed,
                                         stratify=df[shared.col_label])
    train_df.to_csv(shared.train_data, index=False)
    test_df.to_csv(shared.test_data, index=False)


from html.parser import HTMLParser


class MLStripper(HTMLParser):
    """Collects only the text content of an HTML document, discarding the markup."""

    def __init__(self):
        super().__init__()
        self.reset()
        self.strict = False
        self.convert_charrefs = True
        self.fed = []

    def handle_data(self, d):
        self.fed.append(d)

    def get_data(self):
        return ''.join(self.fed)


def text_preprocess(s):
    """Lowercase the text and strip any HTML markup."""
    stripper = MLStripper()
    stripper.feed(s.lower())
    return stripper.get_data()


def text_col(df):
    """Concatenate title and body into a single preprocessed text column."""
    return (df[shared.col_title].fillna('') + df[shared.col_body].fillna('')) \
        .astype('U', copy=False).apply(text_preprocess)


def text_len_col(text):
    """Return the text length as a 2D array, usable as a single numeric feature."""
    return text.str.len().values.reshape(-1, 1)


import re

token_pattern = re.compile(r"(?u)\b\w\w+\b")
# TODO: Better number pattern
number_pattern = re.compile(r"^\d+e?|e?\d+$")


def tokenizer(s):
    """
    Turns numeric tokens into a single <num> token,
    to prevent lots of redundant terms for different numbers.
    """
    tokens = token_pattern.findall(s)
    return ["<num>" if number_pattern.match(t) else t for t in tokens]


def build_pipeline():
    from sklearn.impute import SimpleImputer
    from sklearn.preprocessing import StandardScaler, FunctionTransformer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.pipeline import make_pipeline, make_union
    from sklearn.compose import ColumnTransformer
    from tutorial import prepare_data  # Required for proper pickling of this pipeline

    tfidf = TfidfVectorizer(encoding='utf-8', stop_words='english', analyzer='word',
                            max_features=25000, ngram_range=(1, 2),
                            tokenizer=prepare_data.tokenizer)
    union = make_union(FunctionTransformer(prepare_data.text_len_col), tfidf)
    return ColumnTransformer([
        ('passthrough', 'passthrough', [shared.col_id, shared.col_label]),
        ('num_cols', make_pipeline(SimpleImputer(), StandardScaler()), shared.extra_feature_cols),
        ('text', make_pipeline(FunctionTransformer(prepare_data.text_col), union), shared.text_cols),
    ])


def map_dataframe(df, pipeline):
    """Apply the fitted pipeline and rebuild a sparse dataframe with named columns."""
    tfidf_cols = [f'Text_{col}'
                  for col in pipeline.named_transformers_.text[1].transformer_list[1][1].get_feature_names()]
    cols = [shared.col_id, shared.col_label] + shared.extra_feature_cols + [shared.text_len_col] + tfidf_cols
    return pd.DataFrame.sparse.from_spmatrix(pipeline.transform(df).astype(float), columns=cols)


def prepare_data():
    pipeline = build_pipeline()

    print("Loading train data")
    train_df = pd.read_csv(shared.train_data, encoding='utf-8')
    print("Loading test data")
    test_df = pd.read_csv(shared.test_data, encoding='utf-8')
    print("Done")

    print("Fitting the pipeline...")
    pipeline.fit(train_df)
    print("Done")

    print("Transforming data")
    train_df = map_dataframe(train_df, pipeline)
    test_df = map_dataframe(test_df, pipeline)
    print("Done")

    save(pipeline, shared.pipeline_pkl)

    print("Saving training data")
    train_df.to_pickle(shared.train_processed)
    print("Saving test data")
    test_df.to_pickle(shared.test_processed)
    print("Done")

    # Log the pipeline's scalar hyperparameters to DAGsHub
    from dagshub import dagshub_logger
    with dagshub_logger(should_log_metrics=False, hparams_path='pipeline-params.yml') as logger:
        params = {k: v for k, v in pipeline.get_params().items()
                  if v is None or type(v) in [str, int, float, bool]}
        print('Logging pipeline params:')
        print(params)
        logger.log_hyperparams(params)


def main():
    print("Splitting train and test...")
    split_train_test()
    print("Done")
    print("Preparing data...")
    prepare_data()
    print("Done")


if __name__ == "__main__":
    main()
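
For context, `main()` runs the two stages in order: `split_train_test()` writes the train/test CSVs, and `prepare_data()` fits the feature pipeline, then saves the processed dataframes, the pickled pipeline, and pipeline-params.yml. A minimal usage sketch, assuming this file lives in a `tutorial` package alongside `shared.py` (as its relative imports and the `from tutorial import prepare_data` line suggest):

    # Hypothetical driver script, run from the repository root
    from tutorial import prepare_data

    prepare_data.split_train_test(ratio=0.2, random_seed=42)  # writes shared.train_data / shared.test_data
    prepare_data.prepare_data()  # fits the pipeline and saves the processed pickles

Running the module directly (`python -m tutorial.prepare_data`) does the same via `main()`.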