common.py
import re
import numpy as np
import tensorflow as tf
from itertools import takewhile, repeat
from typing import List, Optional, Tuple, Iterable
from datetime import datetime
from collections import OrderedDict


class common:
    """Static helper methods shared across the project."""

    @staticmethod
    def normalize_word(word):
        # Lower-case a word and drop non-alphabetic characters,
        # unless that would leave nothing.
        stripped = re.sub(r'[^a-zA-Z]', '', word)
        if len(stripped) == 0:
            return word.lower()
        else:
            return stripped.lower()

    @staticmethod
    def _load_vocab_from_histogram(path, min_count=0, start_from=0, return_counts=False):
        # Build word<->index mappings from a histogram file of '<word> <count>' lines.
        with open(path, 'r') as file:
            word_to_index = {}
            index_to_word = {}
            word_to_count = {}
            next_index = start_from
            for line in file:
                line_values = line.rstrip().split(' ')
                if len(line_values) != 2:
                    continue
                word = line_values[0]
                count = int(line_values[1])
                if count < min_count:
                    continue
                if word in word_to_index:
                    continue
                word_to_index[word] = next_index
                index_to_word[next_index] = word
                word_to_count[word] = count
                next_index += 1
        result = word_to_index, index_to_word, next_index - start_from
        if return_counts:
            result = (*result, word_to_count)
        return result

    @staticmethod
    def load_vocab_from_histogram(path, min_count=0, start_from=0, max_size=None, return_counts=False):
        if max_size is not None:
            word_to_index, index_to_word, next_index, word_to_count = \
                common._load_vocab_from_histogram(path, min_count, start_from, return_counts=True)
            if next_index <= max_size:
                results = (word_to_index, index_to_word, next_index)
                if return_counts:
                    results = (*results, word_to_count)
                return results
            # Take min_count to be one plus the count of the max_size'th word
            min_count = sorted(word_to_count.values(), reverse=True)[max_size] + 1
        return common._load_vocab_from_histogram(path, min_count, start_from, return_counts)

    @staticmethod
    def load_json(json_file):
        data = []
        with open(json_file, 'r') as file:
            for line in file:
                current_program = common.process_single_json_line(line)
                if current_program is None:
                    continue
                for element, scope in current_program.items():
                    data.append((element, scope))
        return data

    @staticmethod
    def load_json_streaming(json_file):
        with open(json_file, 'r') as file:
            for line in file:
                current_program = common.process_single_json_line(line)
                if current_program is None:
                    continue
                for element, scope in current_program.items():
                    yield (element, scope)

    @staticmethod
    def save_word2vec_file(output_file, index_to_word, vocab_embedding_matrix: np.ndarray):
        # Write embeddings in the textual word2vec format: a header line with the
        # dimensions, then one '<word> <vector...>' line per vocabulary entry.
        assert len(vocab_embedding_matrix.shape) == 2
        vocab_size, embedding_dimension = vocab_embedding_matrix.shape
        output_file.write('%d %d\n' % (vocab_size, embedding_dimension))
        for word_idx in range(0, vocab_size):
            assert word_idx in index_to_word
            word_str = index_to_word[word_idx]
            output_file.write(word_str + ' ')
            output_file.write(' '.join(map(str, vocab_embedding_matrix[word_idx])) + '\n')

    @staticmethod
    def calculate_max_contexts(file):
        contexts_per_word = common.process_test_input(file)
        return max(
            [max(l, default=0) for l in [[len(contexts) for contexts in prog.values()] for prog in contexts_per_word]],
            default=0)

    @staticmethod
    def binary_to_string(binary_string):
        return binary_string.decode("utf-8")

    @staticmethod
    def binary_to_string_list(binary_string_list):
        return [common.binary_to_string(w) for w in binary_string_list]

    @staticmethod
    def binary_to_string_matrix(binary_string_matrix):
        return [common.binary_to_string_list(l) for l in binary_string_matrix]

    @staticmethod
    def load_file_lines(path):
        with open(path, 'r') as f:
            return f.read().splitlines()

    @staticmethod
    def split_to_batches(data_lines, batch_size):
        for x in range(0, len(data_lines), batch_size):
            yield data_lines[x:x + batch_size]

    @staticmethod
    def legal_method_names_checker(special_words, name):
        # A legal predicted name is non-OOV and contains only letters and '|' subtoken separators.
        return name != special_words.OOV and re.match(r'^[a-zA-Z|]+$', name)

    @staticmethod
    def filter_impossible_names(special_words, top_words):
        result = list(filter(lambda word: common.legal_method_names_checker(special_words, word), top_words))
        return result

    @staticmethod
    def get_subtokens(str):
        return str.split('|')

    @staticmethod
    def parse_prediction_results(raw_prediction_results, unhash_dict, special_words,
                                 topk: int = 5) -> List['MethodPredictionResults']:
        prediction_results = []
        for single_method_prediction in raw_prediction_results:
            current_method_prediction_results = MethodPredictionResults(single_method_prediction.original_name)
            for i, predicted in enumerate(single_method_prediction.topk_predicted_words):
                if predicted == special_words.OOV:
                    continue
                suggestion_subtokens = common.get_subtokens(predicted)
                current_method_prediction_results.append_prediction(
                    suggestion_subtokens, single_method_prediction.topk_predicted_words_scores[i].item())
            # Keep only the top-k contexts, ordered by descending attention weight.
            topk_attention_per_context = [
                (key, single_method_prediction.attention_per_context[key])
                for key in sorted(single_method_prediction.attention_per_context,
                                  key=single_method_prediction.attention_per_context.get, reverse=True)
            ][:topk]
            for context, attention in topk_attention_per_context:
                token1, hashed_path, token2 = context
                if hashed_path in unhash_dict:
                    unhashed_path = unhash_dict[hashed_path]
                    current_method_prediction_results.append_attention_path(attention.item(), token1=token1,
                                                                            path=unhashed_path, token2=token2)
            prediction_results.append(current_method_prediction_results)
        return prediction_results

    @staticmethod
    def tf_get_first_true(bool_tensor: tf.Tensor) -> tf.Tensor:
        # Keep only the first True along the last axis of a boolean tensor.
        bool_tensor_as_int32 = tf.cast(bool_tensor, dtype=tf.int32)
        cumsum = tf.cumsum(bool_tensor_as_int32, axis=-1, exclusive=False)
        return tf.logical_and(tf.equal(cumsum, 1), bool_tensor)

    @staticmethod
    def count_lines_in_file(file_path: str):
        # Count newlines using large buffered reads to avoid loading the whole file.
        with open(file_path, 'rb') as f:
            bufgen = takewhile(lambda x: x, (f.raw.read(1024 * 1024) for _ in repeat(None)))
            return sum(buf.count(b'\n') for buf in bufgen)

    @staticmethod
    def squeeze_single_batch_dimension_for_np_arrays(arrays):
        assert all(array is None or isinstance(array, np.ndarray) or isinstance(array, tf.Tensor) for array in arrays)
        return tuple(
            None if array is None else np.squeeze(array, axis=0)
            for array in arrays
        )

    @staticmethod
    def get_first_match_word_from_top_predictions(special_words, original_name,
                                                  top_predicted_words) -> Optional[Tuple[int, str]]:
        normalized_original_name = common.normalize_word(original_name)
        for suggestion_idx, predicted_word in enumerate(
                common.filter_impossible_names(special_words, top_predicted_words)):
            normalized_possible_suggestion = common.normalize_word(predicted_word)
            if normalized_original_name == normalized_possible_suggestion:
                return suggestion_idx, predicted_word
        return None

    @staticmethod
    def now_str():
        return datetime.now().strftime("%Y%m%d-%H%M%S: ")

    @staticmethod
    def chunks(l, n):
        """Yield successive n-sized chunks from l."""
        for i in range(0, len(l), n):
            yield l[i:i + n]

    @staticmethod
    def get_unique_list(lst: Iterable) -> list:
        # De-duplicate while preserving first-occurrence order.
        return list(OrderedDict(((item, 0) for item in lst)).keys())


class MethodPredictionResults:
    """Accumulates the top-k name predictions and top attended paths for a single method."""

    def __init__(self, original_name):
        self.original_name = original_name
        self.predictions = list()
        self.attention_paths = list()

    def append_prediction(self, name, probability):
        self.predictions.append({'name': name, 'probability': probability})

    def append_attention_path(self, attention_score, token1, path, token2):
        self.attention_paths.append({'score': attention_score,
                                     'path': path,
                                     'token1': token1,
                                     'token2': token2})
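
A minimal usage sketch (an illustration, not part of the file): 'word_histogram.txt' is a hypothetical histogram file whose lines follow the '<word> <count>' format that load_vocab_from_histogram parses, and the batch size of 64 is arbitrary.

from common import common

# Hypothetical input: 'word_histogram.txt' holds one '<word> <count>' pair per line.
word_to_index, index_to_word, vocab_size = common.load_vocab_from_histogram(
    'word_histogram.txt', min_count=1, max_size=10000)
print(common.now_str() + 'loaded vocabulary of size %d' % vocab_size)

# Process the vocabulary in fixed-size batches (batch size chosen arbitrarily).
for batch in common.split_to_batches(list(word_to_index), batch_size=64):
    pass  # each batch is a list of up to 64 words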