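"""Label code blocks in a CSV with graph vertices via regex token search.

For every vertex of graph_v{GRAPH_VER}, scan the `code_block` column for the
vertex's tokens and write a 0/1 indicator column. Optionally evaluate the
regex labelling against a golden validation set and log metrics to DagsHub.
"""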
import re
import json
import argparse

import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score

import dagshub
def tokens_search(df, tokens, new_column_name):
    """Set new_column_name to 1 for rows whose code block matches any token."""
    df[new_column_name] = 0
    for i in range(len(df)):
        # lightweight progress indicator
        print('{}%\r'.format(round(100 * i / len(df), 1)), end='')
        row = df.loc[i, CODE_COLUMN]
        for token in tokens:
            # tokens are used as regex patterns; escape the opening bracket
            # so call-style tokens such as "plt.plot(" stay valid patterns
            if re.search(token.replace('(', '\\('), row) is not None:
                df.loc[i, new_column_name] = 1
                break
    return df
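# A minimal usage sketch (hypothetical data): for
#   df = pd.DataFrame({'code_block': ['df.head()', 'plt.plot(x)']})
# tokens_search(df, ['plt.plot('], 'visualization') adds a
# 'visualization' column equal to [0, 1].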
parser = argparse.ArgumentParser()
parser.add_argument("GRAPH_VER", help="version of the graph you want regex to label your CSV with", type=int)
parser.add_argument("DATASET_PATH", help="path to your input CSV", type=str)
# argparse's type=bool treats any non-empty string (even "False") as True,
# so expose the switch as a store_true flag instead
parser.add_argument("-eval", "--evaluation", help="evaluate the regex labelling after creating it", action="store_true")
args = parser.parse_args()

GRAPH_VER = args.GRAPH_VER
DATASET_PATH = args.DATASET_PATH
evaluation = args.evaluation

# strip the ".csv" extension before appending the graph version
OUTPUT_DATASET_PATH = '{}_regex_graph_v{}.csv'.format(DATASET_PATH[:-4], GRAPH_VER)
CODE_COLUMN = 'code_block'
GRAPH_PATH = './graph/graph_v{}.txt'.format(GRAPH_VER)
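# Example invocation (hypothetical file names):
#   python regex_labeler.py 5 ./data/code_blocks.csv --evaluation
# which writes ./data/code_blocks_regex_graph_v5.csv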
if __name__ == '__main__':
    df = pd.read_csv(DATASET_PATH, encoding='utf-8', sep=',')
    print('opened input data')
    n_empty = df[CODE_COLUMN].isna().sum()
    if n_empty > 0:
        print('Empty chunks found: {}'.format(n_empty))
        # drop=True so the stale index does not end up as a column in the output CSV
        df = df.dropna(subset=[CODE_COLUMN]).reset_index(drop=True)
    with open(GRAPH_PATH, "r") as graph_file:
        graph = json.load(graph_file)
    print('opened graph')
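    # The graph file is JSON mapping each vertex name to its token list,
    # e.g. (hypothetical vertices and tokens):
    #   {"visualization": ["plt.plot(", "sns.heatmap("],
    #    "load_data": ["pd.read_csv("]}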
    vertices = []
    for vertex, tokens in graph.items():
        vertices.append(vertex)
        print('\n' + vertex)
        df = tokens_search(df, tokens, vertex)
    print('labelled')
    df.to_csv(OUTPUT_DATASET_PATH, index=False)
    print('saved and finished')
    if evaluation:
        VALIDATION_DATA_PATH = "./data/golden_884_set.csv"
        TAGS = vertices
        REGEX_TAGS = [el + '_regex_v{}'.format(GRAPH_VER) for el in TAGS]
        regexed_data = pd.read_csv(VALIDATION_DATA_PATH)
        Y_test, Y_pred = regexed_data[TAGS], regexed_data[REGEX_TAGS]
        base_f1 = f1_score(Y_test, Y_pred, average='weighted')
        base_precision = precision_score(Y_test, Y_pred, average='weighted')
        base_recall = recall_score(Y_test, Y_pred, average='weighted')
        regex_results = {'test_f1_score': base_f1,
                         'test_precision': base_precision,
                         'test_recall': base_recall}
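        # average='weighted' weights each label's score by its support
        # (number of true instances), so frequent tags dominate the aggregates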
        for i, tag in enumerate(TAGS):
            tag_results = (round(f1_score(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'), 4),
                           round(precision_score(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'), 4),
                           round(recall_score(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'), 4))
            print(tag)
            print(tag_results)
            regex_results.update({'test_f1_score_{}'.format(tag): tag_results[0],
                                  'test_precision_{}'.format(tag): tag_results[1],
                                  'test_recall_{}'.format(tag): tag_results[2]})
        print('------')
        data_meta = {'DATASET_PATH': VALIDATION_DATA_PATH,
                     'nrows': regexed_data.shape[0],
                     'graph_ver': GRAPH_VER,
                     'label': TAGS,
                     'model': 'regex_v{}'.format(GRAPH_VER),
                     'script_dir': './regex.ipynb',
                     'task': 'regex evaluation'}
        with dagshub.dagshub_logger() as logger:
            logger.log_hyperparams(data_meta)
            logger.log_metrics(regex_results)
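        # dagshub_logger defaults to writing metrics.csv and params.yml,
        # which DagsHub can render alongside the repo once pushed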