Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

logreg_classifier.py 6.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
  1. ##TODO:
  2. # Arguments from the input
  3. # Excessive functions -> parameters
  4. # Common module
  5. import pandas as pd
  6. import numpy as np
  7. import matplotlib.pyplot as plt
  8. import scipy.stats
  9. from sklearn.feature_extraction.text import TfidfVectorizer
  10. from sklearn.linear_model import LogisticRegression
  11. from sklearn.multioutput import MultiOutputRegressor
  12. from sklearn.model_selection import train_test_split
  13. from sklearn.metrics import *
  14. import dagshub
  15. import pickle
  16. def load_corpus(DATASET_PATH, CODE_COLUMN):
  17. df = pd.read_csv(DATASET_PATH, encoding='utf-8', comment='#', sep='\t')#, quoting=csv.QUOTE_NONE, error_bad_lines=False)#, sep=','
  18. df.dropna(axis=0, inplace=True)
  19. corpus = df[CODE_COLUMN]
  20. test_size = 0.1
  21. test_rows = round(df.shape[0]*test_size)
  22. train_rows = df.shape[0] - test_rows
  23. train_corpus = df[CODE_COLUMN][0:test_rows]
  24. test_corpus = df[CODE_COLUMN][train_rows:]
  25. return df, corpus
  26. def tfidf_transform(corpus, tfidf_params, TFIDF_DIR):
  27. tfidf = pickle.load(open(TFIDF_DIR, 'rb'))
  28. features = tfidf.transform(corpus)
  29. return features
  30. def tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR):
  31. tfidf = TfidfVectorizer(tfidf_params)
  32. print(code_blocks.head())
  33. tfidf = tfidf.fit(code_blocks)
  34. pickle.dump(tfidf, open(TFIDF_DIR, "wb"))
  35. code_blocks_tfidf = tfidf.transform(code_blocks)
  36. return code_blocks_tfidf
  37. def logreg_evaluate(df, code_blocks, TAG_TO_PREDICT):
  38. code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
  39. X_train, X_test, y_train, y_test = train_test_split(code_blocks_tfidf, df[TAG_TO_PREDICT], test_size=0.25)
  40. clf = LogisticRegression(random_state=421).fit(X_train, y_train)
  41. print("saving the model")
  42. pickle.dump(clf, open(MODEL_DIR, 'wb'))
  43. y_pred = clf.predict(X_test)
  44. accuracy = clf.score(X_test, y_test)
  45. f1 = f1_score(y_pred, y_test, average='weighted')
  46. print(f'Mean Accuracy {round(accuracy*100, 2)}%')
  47. print(f'F1-score {round(f1*100, 2)}%')
  48. errors = y_test - y_pred
  49. plt.hist(errors)
  50. plot_precision_recall_curve(clf, X_test, y_test)
  51. plot_confusion_matrix(clf, X_test, y_test, values_format='d')
  52. def mean_confidence_interval(data, confidence=0.95):
  53. a = 1.0 * np.array(data)
  54. n = len(a)
  55. m, se = np.mean(a), scipy.stats.sem(a)
  56. h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
  57. return m, m-h, m+h
  58. conf_interval = mean_confidence_interval(errors, 0.95)
  59. print(conf_interval)
  60. metrics = {'test_accuracy': accuracy
  61. , 'test_f1_score': f1}
  62. return metrics
  63. def logreg_multioutput_evaluate(df, code_blocks, TAGS_TO_PREDICT):
  64. code_blocks_tfidf = tfidf_fit_transform(code_blocks, tfidf_params, TFIDF_DIR)
  65. print("splitting")
  66. X_train, X_test, Y_train, Y_test = train_test_split(code_blocks_tfidf, df[TAGS_TO_PREDICT], test_size=0.25)
  67. print("training the model")
  68. clf = MultiOutputRegressor(LogisticRegression(random_state=421)).fit(X_train, Y_train)
  69. print("saving the model")
  70. pickle.dump(clf, open(MODEL_DIR, 'wb'))
  71. Y_pred = clf.predict(X_test)
  72. accuracy = clf.score(X_test, Y_test)
  73. f1 = f1_score(Y_pred, Y_test, average='weighted')
  74. print(f'Mean Accuracy {round(accuracy*100, 2)}%')
  75. print(f'F1-score {round(f1*100, 2)}%')
  76. # errors = Y_test - Y_pred
  77. # plt.hist(errors)
  78. # plot_precision_recall_curve(clf, X_test, Y_test)
  79. # plot_confusion_matrix(clf, X_test, Y_test, values_format='d')
  80. # def mean_confidence_interval(data, confidence=0.95):
  81. # a = 1.0 * np.array(data)
  82. # n = len(a)
  83. # m, se = np.mean(a), scipy.stats.sem(a)
  84. # h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
  85. # return m, m-h, m+h
  86. # conf_interval = mean_confidence_interval(errors, 0.95)
  87. # print(conf_interval)
  88. metrics = {'test_accuracy': accuracy
  89. , 'test_f1_score': f1}
  90. return metrics
  91. def get_predictions(X, y, TAGS_TO_PREDICT, MODEL_DIR):
  92. clf = pickle.load(open(MODEL_DIR, 'rb'))
  93. # result = loaded_model.score(X, y)
  94. y_pred = clf.predict(X)
  95. accuracy = accuracy_score(y_pred, y)
  96. f1 = f1_score(y_pred, y, average='weighted')
  97. print(f'Mean Accuracy {round(accuracy*100, 2)}%')
  98. print(f'F1-score {round(f1*100, 2)}%')
  99. errors = y - y_pred
  100. plt.hist(errors)
  101. plot_precision_recall_curve(clf, X, y)
  102. plot_confusion_matrix(clf, X, y, values_format='d')
  103. def mean_confidence_interval(data, confidence=0.95):
  104. a = 1.0 * np.array(data)
  105. n = len(a)
  106. m, se = np.mean(a), scipy.stats.sem(a)
  107. h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
  108. return m, m-h, m+h
  109. conf_interval = mean_confidence_interval(errors, 0.95)
  110. print(conf_interval)
  111. metrics = {'test_accuracy': accuracy
  112. , 'test_f1_score': f1}
  113. return metrics
  114. GRAPH_VER = 5
  115. DATASET_PATH = './data/kaggle_10_regex_v{}.csv'.format(GRAPH_VER)
  116. MODEL_DIR = './models/logreg_regex_graph_v{}.sav'.format(GRAPH_VER)
  117. TFIDF_DIR = './models/tfidf_logreg_graph_v{}.pickle'.format(GRAPH_VER)
  118. CODE_COLUMN = 'code_block'
  119. TAGS_TO_PREDICT = ['import', 'data_import', 'data_export', 'preprocessing',
  120. 'visualization', 'model', 'train', 'predict']
  121. PREDICT_COL = 'pred_{}'.format(TAGS_TO_PREDICT)
  122. SCRIPT_DIR = 'logreg_classifier.ipynb'
  123. VAL_CHUNK_SIZE = 10
  124. VAL_CODE_COLUMN = 'code'
  125. VAL_TAGS_TO_PREDICT = 'tag'
  126. VAL_DATASET_PATH = './data/chunks_{}_validate.csv'.format(VAL_CHUNK_SIZE)
  127. if __name__ == '__main__':
  128. df, code_blocks = load_corpus(DATASET_PATH, CODE_COLUMN)
  129. nrows = df.shape[0]
  130. print("loaded")
  131. tfidf_params = {'min_df': 5
  132. , 'max_df': 0.3
  133. , 'smooth_idf': True}
  134. data_meta = {'DATASET_PATH': DATASET_PATH
  135. ,'nrows': nrows
  136. ,'label': TAGS_TO_PREDICT
  137. ,'model': MODEL_DIR
  138. ,'script_dir': SCRIPT_DIR
  139. ,'task': 'training and evaluation'}
  140. print("tfidf-ed")
  141. with dagshub.dagshub_logger() as logger:
  142. metrics = logreg_multioutput_evaluate(df, code_blocks, TAGS_TO_PREDICT)
  143. # metrics = get_predictions(features, df[TAGS_TO_PREDICT], TAGS_TO_PREDICT, MODEL_DIR)
  144. logger.log_hyperparams(data_meta)
  145. logger.log_hyperparams(tfidf_params)
  146. logger.log_metrics(metrics)
  147. print("finished")
Tip!

Press p to see the previous file or n to see the next file

Comments

Loading...