regex.py

import re
import csv
import json
import argparse
import yaml
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score


def tokens_search(df, tokens, new_column_name):
    """Set new_column_name to 1 for every row whose code block matches any token."""
    df[new_column_name] = 0
    for i in range(len(df)):
        percents = str(round(100 * i / len(df), 1))
        print(percents + '%\r', end='')
        row = df.loc[i, CODE_COLUMN]
        for token in tokens:
            # Escape parentheses so tokens such as "read_csv(" are matched literally.
            result = re.search(token.replace('(', r'\(').replace(')', r'\)'), row)
            if result is not None:
                df.loc[i, new_column_name] = 1
                break
    return df


evaluation = False

# Configuration comes from the command line when arguments are given, otherwise
# from params.yaml (argparse raises SystemExit when the required positionals are missing).
try:
    parser = argparse.ArgumentParser()
    parser.add_argument("GRAPH_VER", help="version of the graph you want regex to label your CSV with", type=int)
    parser.add_argument("DATASET_PATH", help="path to your input CSV", type=str)
    parser.add_argument("-eval", "--evaluation", help="evaluate regex after labelling", action="store_true", default=None)
    args = parser.parse_args()
    GRAPH_VER = args.GRAPH_VER
    DATASET_PATH = args.DATASET_PATH
    if args.evaluation is not None:
        evaluation = args.evaluation
except SystemExit:
    print('Got no arguments, taking default arguments from params.yaml')
    with open("params.yaml", 'r') as fd:
        params = yaml.safe_load(fd)
    GRAPH_VER = params['GRAPH_VER']
    DATASET_PATH = params['regex']['DATASET_PATH']
    if params['regex']['evaluation'] is not None:
        evaluation = params['regex']['evaluation']

OUTPUT_DATASET_PATH = '{}_regex_graph_v{}.csv'.format(DATASET_PATH[:-4], GRAPH_VER)
CODE_COLUMN = 'code_block'
GRAPH_DIR = '../graph/graph_v{}.txt'.format(GRAPH_VER)

if __name__ == '__main__':
    df = pd.read_csv(DATASET_PATH, encoding='utf-8', sep=',')
    print('opened input data')
    if df[CODE_COLUMN].isna().sum() > 0:
        print('Empty chunks found: {}'.format(df[CODE_COLUMN].isna().sum()))
        df = df.dropna(subset=[CODE_COLUMN]).reset_index()

    # The graph file is a JSON object mapping each vertex (label name) to its list of tokens.
    with open(GRAPH_DIR, "r") as graph_file:
        graph = json.load(graph_file)
    print('opened graph')

    # Label the input data: one new binary column per graph vertex.
    vertices = []
    for vertex in graph:
        vertices.append(vertex)
        print('\n creating labels for {}'.format(vertex))
        tokens = graph[vertex]
        df = tokens_search(df, tokens, vertex)
    print('labelled')
    df.to_csv(OUTPUT_DATASET_PATH, index=False)
    print('saved and finished')

    if evaluation:
        # Compare regex labels against manually marked-up validation data.
        print('evaluating regex v{}'.format(GRAPH_VER))
        VALIDATION_DATA_PATH = "../markup_data.csv"
        TAGS = vertices
        REGEX_TAGS = [el + '_regex_v{}'.format(GRAPH_VER) for el in TAGS]
        regexed_data = pd.read_csv(VALIDATION_DATA_PATH)
        print('loaded validation data')
        for i, tag in enumerate(TAGS):
            print('\n creating labels for {}'.format(tag))
            tokens = graph[tag]
            regexed_data = tokens_search(regexed_data, tokens, REGEX_TAGS[i])
        print('val data labelled')

        # Overall weighted metrics across all tags.
        Y_test, Y_pred = regexed_data[TAGS], regexed_data[REGEX_TAGS]
        base_f1 = f1_score(Y_test, Y_pred, average='weighted')
        base_precision = precision_score(Y_test, Y_pred, average='weighted')
        base_recall = recall_score(Y_test, Y_pred, average='weighted')
        regex_results = {'test_f1_score': base_f1,
                         'test_precision': base_precision,
                         'test_recall': base_recall}

        # Per-tag metrics.
        for i, tag in enumerate(TAGS):
            tag_results = (round(f1_score(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'), 4),
                           round(precision_score(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'), 4),
                           round(recall_score(Y_test.iloc[:, i], Y_pred.iloc[:, i], average='weighted'), 4))
            print(tag)
            print(tag_results)
            regex_results.update({'test_f1_score_{}'.format(tag): tag_results[0],
                                  'test_precision_{}'.format(tag): tag_results[1],
                                  'test_recall_{}'.format(tag): tag_results[2]})
            print('------')

        # Metadata describing this evaluation run.
        data_meta = {'DATASET_PATH': VALIDATION_DATA_PATH,
                     'nrows': regexed_data.shape[0],
                     'graph_ver': GRAPH_VER,
                     'label': TAGS,
                     'model': 'regex_v{}'.format(GRAPH_VER),
                     'script_dir': './regex.ipynb',
                     'task': 'regex evaluation'}
        print("metrics:", regex_results)