Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

regex.py 3.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
  1. import re
  2. import csv
  3. import json
  4. import argparse
  5. import pandas as pd
  6. import numpy as np
  7. from sklearn.metrics import f1_score, precision_score, recall_score
  8. import dagshub
  9. def tokens_search(df, tokens, new_column_name):
  10. df[new_column_name] = 0
  11. for i in range(len(df)):
  12. percents = str(round(100*i/len(df),1))
  13. print(percents + '%\r', end='')
  14. row = df[CODE_COLUMN][i]
  15. for token in tokens:
  16. result = re.search(token.replace('(','\('), row)
  17. if result!=None:
  18. df[new_column_name][i] = 1
  19. break
  20. return df
  21. parser = argparse.ArgumentParser()
  22. parser.add_argument("GRAPH_VER", help="version of the graph you want regex to label your CSV with", type=int)
  23. parser.add_argument("DATASET_PATH", help="path to your input CSV", type=str)
  24. parser.add_argument("-eval", "--evaluation", help="evalute regex after creating", type=bool)
  25. args = parser.parse_args()
  26. GRAPH_VER = args.GRAPH_VER
  27. DATASET_PATH = args.DATASET_PATH
  28. evaluation = False
  29. if args.evaluation is not None:
  30. evaluation = args.evaluation
  31. OUTPUT_DATASET_PATH = '{}_regex_graph_v{}.csv'.format(DATASET_PATH[:-4], GRAPH_VER)
  32. CODE_COLUMN = 'code_block'
  33. GRAPH_DIR = './graph/graph_v{}.txt'.format(GRAPH_VER)
  34. if __name__ == '__main__':
  35. df = pd.read_csv(DATASET_PATH, encoding='utf-8', sep=',')
  36. print('opened input data')
  37. if df[CODE_COLUMN].isna().sum() > 0:
  38. print('Empty chunks found: {}'.format(df[CODE_COLUMN].isna().sum()))
  39. df = df.dropna(subset=[CODE_COLUMN]).reset_index()
  40. with open(GRAPH_DIR, "r") as graph_file:
  41. graph = json.load(graph_file)
  42. print('opened graph')
  43. vertices = []
  44. for i in range(0, len(graph)):
  45. vertex = list(graph.keys())[i]
  46. vertices.append(vertices)
  47. print('\n' + vertex)
  48. tokens = graph[vertex]
  49. df = tokens_search(df, tokens, vertex)
  50. print('labelled')
  51. df.to_csv(OUTPUT_DATASET_PATH, index=False)
  52. print('saved and finished')
  53. if evaluation:
  54. VALIDATION_DATA_PATH = "./data/golden_884_set.csv"
  55. TAGS = vertices
  56. REGEX_TAGS = [el + '_regex_v{}'.format(GRAPH_VER) for el in TAGS]
  57. regexed_data = pd.read_csv(VALIDATION_DATA_PATH)
  58. Y_test, Y_pred = regexed_data[TAGS], regexed_data[REGEX_TAGS]
  59. base_f1 = f1_score(Y_test, Y_pred, average='weighted')
  60. base_precision = precision_score(Y_test, Y_pred, average='weighted')
  61. base_recall = recall_score(Y_test, Y_pred, average='weighted')
  62. regex_results = {'test_f1_score': base_f1
  63. , 'test_precision': base_precision
  64. , 'test_recall': base_recall}
  65. for i, tag in enumerate(TAGS):
  66. tag_results = (round(f1_score(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'),4),\
  67. round(precision_score(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'),4),\
  68. round(recall_score(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'),4))
  69. print(tag)
  70. print(tag_results)
  71. regex_results.update({'test_f1_score_{}'.format(tag): tag_results[0]
  72. , 'test_precision_{}'.format(tag): tag_results[1]
  73. , 'test_recall_{}'.format(tag): tag_results[2]})
  74. print('------')
  75. data_meta = {'DATASET_PATH': VALIDATION_DATA_PATH
  76. ,'nrows': regexed_data.shape[0]
  77. ,'graph_ver': GRAPH_VER
  78. ,'label': TAGS
  79. ,'model': 'regex_v{}'.format(GRAPH_VER)
  80. ,'script_dir': './regex.ipynb'
  81. ,'task': 'regex evaluation'}
  82. with dagshub.dagshub_logger() as logger:
  83. logger.log_hyperparams(data_meta)
  84. logger.log_metrics(regex_results)
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...