Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

#6 build(deps): bump tqdm from 4.66.1 to 4.66.3

Open
GitHub User wants to merge 1 commit into ncusi:main from ncusi:dependabot/pip/tqdm-4.66.3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
  1. import copy
  2. import re
  3. from difflib import SequenceMatcher, get_close_matches
  4. from operator import itemgetter
  5. from unidiff import PatchSet, PatchedFile, Hunk
  6. from joblib import Parallel, delayed
  def to_str(self):
      """Concatenate the string form of every item of `self`.

      :param self: iterable whose items are converted with `str()`
      :return: string forms of all items, joined without any separator
      :rtype: str
      """
      return "".join(map(str, self))
  def hunk_to_str(self):
      """Build the text used for comparing a hunk with other text.

      `self` may be a synthetic Hunk that get_hunk_images() created purely
      as a container for unidiff.patch.Line objects.

      :param Hunk self: modified block of a file, may be synthetic
      :return: the `value` of every line in the hunk (that is, the line
          without its diff prefix), concatenated into a single string
      :rtype: str
      """
      values = [line.value for line in self]
      return "".join(values)
  # Monkey-patch unidiff's Hunk so that every hunk offers a `to_str()` method
  setattr(Hunk, "to_str", hunk_to_str)
  def get_close_matches2(word, pos, n, cutoff=0.5):
      """Pick the candidate sharing the longest contiguous block with `word`.

      :param str word: text to look for
      :param pos: iterable of candidate strings
      :param n: not used by this function
      :param float cutoff: minimal fraction of `len(word)` that the longest
          common block has to cover
      :return: first candidate with the longest common block, or None when
          nothing matches or the block covers less than `cutoff` of `word`
      :rtype: str or None
      """
      best_size = 0
      best_candidate = ""
      for candidate in pos:
          longest = SequenceMatcher(a=word, b=candidate).find_longest_match()
          if longest.size <= best_size:
              # only a strictly longer block replaces the current best
              continue
          best_size = longest.size
          best_candidate = candidate

      if best_size == 0:
          return None
      if best_size / len(word) < cutoff:
          return None
      return best_candidate
  def get_close_matches3(word, pos, n, cutoff):
      """Return the first non-blank candidate similar enough to `word`.

      Whitespace-only candidates are skipped before scoring.  They can never
      be returned, and if they were scored a high-ratio blank candidate
      would raise the bar and hide later, genuine matches.

      :param str word: text to look for
      :param pos: iterable of candidate strings
      :param n: not used by this function
      :param float cutoff: minimal `SequenceMatcher.ratio()` a candidate
          must reach to be accepted
      :return: first non-blank candidate whose similarity ratio is positive
          and at least `cutoff`, or None if there is no such candidate
      :rtype: str or None
      """
      for candidate in pos:
          if not candidate.strip():
              continue
          matcher = SequenceMatcher(a=word, b=candidate)
          # quick_ratio() is a cheap upper bound on ratio(), so a candidate
          # failing it cannot reach the cutoff either
          if matcher.quick_ratio() < cutoff:
              continue
          score = matcher.ratio()
          if score > 0 and score >= cutoff:
              return candidate
      return None
  class CompareBase:
      """Common state shared by the comparison strategies in this module."""

      def __init__(self, image, lines=False):
          """Set up the comparison state for one hunk image.

          :param image: object providing a `to_str()` method, e.g. a Hunk
          :param bool lines: whether the position also has an "l" entry
          """
          # best position so far: "r" is the ratio, "p" the conversation
          # index; "l" is present only when `lines` is set
          position = {"r": 0, "p": None}
          self.pos = position
          self.lines = lines
          if lines:
              position["l"] = None
          self.rnewline = re.compile(r"\n")
          self.image = image
          self.simage = image.to_str()
          # sequence matcher and maximal match
          self.seq_match = None
          self.chats = []

      def __repr__(self):
          return str(self.pos)
  class CompareLines(CompareBase):
      """Match image lines against the lines of all collected chat texts."""

      def __init__(self, image, lines=False, threshold=0.5):
          """
          :param image: object providing `to_str()`, iterated in final()
          :param bool lines: passed on to CompareBase
          :param float threshold: cutoff handed to get_close_matches3()
          """
          super().__init__(image, lines)
          self.threshold = threshold

      def compare(self, b, pno, lno=None):
          """Remember chat text `b`; the matching itself happens in final().

          :param str b: chat text to collect
          :param pno: not used by this method
          :param lno: not used by this method
          """
          self.chats.append(b)

      def final(self):
          """Return `diff_line_no` of image lines that have a close chat line.

          :return: diff line numbers of the lines for which
              get_close_matches3() found a match among the chat lines
          :rtype: list
          """
          chat_lines = [
              chat_line
              for chat in self.chats
              for chat_line in chat.splitlines()
          ]
          matched = []
          for line in self.image:
              text = str(line)
              if not text.strip():
                  continue
              if get_close_matches3(text, chat_lines, 1, self.threshold):
                  matched.append(line.diff_line_no)
          return matched
  class CompareLinesFragmentThreshold(CompareLines):
      """CompareLines variant that collects only sufficiently similar chats."""

      def __init__(self, image, lines=False):
          super().__init__(image, lines)

      def compare(self, b, pno, lno=None):
          """Collect chat text `b` if it resembles the whole image text.

          :param str b: chat text to compare with the image text
          :param pno: not used by this method
          :param lno: not used by this method
          """
          matcher = SequenceMatcher(None, self.simage, b)
          # Threshold version: checks ordered cheapest first, short-circuiting
          similar_enough = (
              matcher.real_quick_ratio() >= 0.1
              and matcher.quick_ratio() >= 0.1
              and matcher.ratio() >= 0.1
          )
          if not similar_enough:
              return
          self.seq_match = matcher
          self.chats.append(b)
  class CompareTopFragments(CompareBase):
      """Keep track of the single best-matching chat fragment.

      Work in progress: this class itself defines only compare().
      """

      def __init__(self, image, lines=False):
          super().__init__(image, lines)

      def compare(self, b, pno, lno=None):
          """Record `b` if it beats the best similarity ratio seen so far.

          :param str b: chat text to compare with the image text
          :param pno: stored as "p" in `self.pos` on a new best match
          :param lno: stored as "l" in `self.pos` when `self.lines` is set
          """
          best_ratio = self.pos["r"]
          matcher = SequenceMatcher(None, self.simage, b)
          # Max version: the two cheap estimates are tried before ratio()
          if matcher.real_quick_ratio() < best_ratio or matcher.quick_ratio() < best_ratio:
              return
          ratio = matcher.ratio()
          if ratio <= best_ratio:
              return
          self.pos = {"r": ratio, "p": pno}
          self.seq_match = matcher
          self.chat = b
          if self.lines:
              self.pos["l"] = lno
  class CompareFragments(CompareBase):
      """Find the chat fragment most similar to a hunk image.

      Each compare() call scores one chat fragment against the whole image
      text and only the best-scoring fragment is kept; final() then matches
      the image against that fragment line by line.
      """

      def __init__(self, image, lines=False):
          """Construct a CompareFragments

          :param Hunk image: pre-image or post-image hunk of commit diff;
              unidiff.Hunk is monkey-patched to have `to_str` attribute (method).
          :param bool lines: whether to remember `lno` in compare()
          """
          # noinspection PyTypeChecker
          super().__init__(image, lines)
          # text of the best-matching fragment seen so far
          self.chat = ""

      def compare(self, b, pno, lno=None):
          """Compare pre-image or post-image hunk against ChatGPT text

          :param str b: text of element of ChatGPT conversation
              (prompt, or answer, or code block)
          :param int pno: index into 'Conversations' list of 'ChatgptSharing'
          :param int or None lno: index into 'ListOfCode' list of conversation
          :rtype: None
          """
          best = self.pos["r"]
          s = SequenceMatcher(None, self.simage, b)
          # real_quick_ratio() and quick_ratio() are cheap upper bounds on
          # ratio(), so the expensive ratio() runs only when it might win
          if s.real_quick_ratio() >= best and s.quick_ratio() >= best:
              r = s.ratio()
              if r > best:
                  self.pos = {"r": r, "p": pno}
                  self.seq_match = s
                  self.chat = b
                  if self.lines:
                      self.pos["l"] = lno

      def final(self, cutoff=0.6,
                ret_chat_line_no=False, ret_score=False):
          """Final result of sequence of compare()'s

          :param float cutoff: a float in the range [0, 1], default 0.6.
              Lines from ChatGPT that don't score at least that similar to
              patch line are ignored.
          :param bool ret_chat_line_no: whether to add chat_line_no to output
          :param bool ret_score: whether to add similarity score to output
          :return: list of diff line numbers of those lines in the `self.image`
              Hunk that have at least 1 matching line with at least `cutoff`
              similarity in the best-matching chat fragment (prompt, or answer,
              or code block), or list of tuples that include diff line number
              as first element; empty list if no fragment matched at all
          :rtype: list[int] or list[tuple[int, int]] or list[tuple[int, float]] or list[tuple[int, int, float]]
          """
          if not self.seq_match:
              return []

          chat_lines = self.chat.splitlines()
          ret = []
          for line in self.image:
              line_s = getattr(line, 'value', str(line)).rstrip('\n')
              # skip empty lines; str(line) for adding empty line is '+\n',
              # so it does not match ''
              if not line_s:
                  continue

              m = get_close_matches(line_s, chat_lines, n=1, cutoff=cutoff)
              if not m:
                  continue

              res = line.diff_line_no
              if ret_chat_line_no:
                  # m[0] comes from chat_lines, so index() always finds it;
                  # like the lookup it replaces, it gives the first occurrence
                  res = (res, chat_lines.index(m[0]))
              if ret_score:
                  # following source of get_close_matches() in difflib library
                  # https://github.com/python/cpython/blob/main/Lib/difflib.py
                  # s.set_seq2(word), s.set_seq1(possibilities[i])
                  r = SequenceMatcher(a=m[0], b=line_s).ratio()
                  res = (*res, r) if isinstance(res, tuple) else (res, r)
              ret.append(res)

          return ret
  def get_hunk_images(hunk):
      """Split a hunk into pre-image and post-image Hunk

      The second Hunk of the returned tuple holds copies of the added lines
      only; copies of all the other lines go to the first Hunk.

      Note that those returned synthesized hunks may lack correct
      header information - they are used only as containers for patch.Line.

      :param Hunk hunk: original part of diff, includes added, removed, and context lines
      :return: "preimage" and "postimage" hunks
      :rtype: (Hunk, Hunk)
      """
      postimage, preimage = Hunk(), Hunk()
      for line in hunk:
          destination = postimage if line.is_added else preimage
          destination.append(copy.copy(line))
      return preimage, postimage
  def get_max_coverage(image, conv, Compare = CompareFragments,
                       ret_chat_line_no=False, ret_score=False):
      """Compare a hunk image with the prompts, answers and code of a chat.

      Returns dict with the following structure:
      {
          "P": <final() result of comparing `image` with every "Prompt">,
          "p": <`pos` of the "Prompt" comparison>,
          "A": <final() result of comparing `image` with every "Answer">,
          "a": <`pos` of the "Answer" comparison>,
          "L": <final() result of comparing `image` with "ListOfCode" contents>,
          "l": <`pos` of the "ListOfCode" comparison>,
      }

      :param Hunk image: modified block of file, changed by diff;
          might be synthesized hunk returned by :func:`get_hunk_images`
      :param conv: "Conversations" part of `ChatgptSharing` structure: a
          sequence of items with "Prompt", "Answer" and "ListOfCode" keys, see
          https://github.com/NAIST-SE/DevGPT/blob/main/README.md#conversations
      :param type[CompareBase] Compare: compare class to use
      :param bool ret_chat_line_no: whether to add chat_line_no to output
      :param bool ret_score: whether to add similarity score to output
      :return: comparison results, keyed as described above
      :rtype: dict
      """
      answer_cmp = Compare(image)
      prompt_cmp = Compare(image)
      code_cmp = Compare(image, lines=True)

      # iterate over conversation
      for pno, turn in enumerate(conv):
          prompt_text, answer_text = turn["Prompt"], turn["Answer"]
          prompt_cmp.compare(prompt_text, pno)
          answer_cmp.compare(answer_text, pno)
          for lno, code in enumerate(turn["ListOfCode"]):
              code_cmp.compare(code["Content"], pno, lno)

      final_options = {
          "ret_chat_line_no": ret_chat_line_no,
          "ret_score": ret_score,
      }
      result = {}
      # upper-case key: matched lines, lower-case key: best position;
      # in turn among 'Prompt', 'Answer' and 'ListOfCode'
      for lines_key, pos_key, comparer in (
          ("P", "p", prompt_cmp),
          ("A", "a", answer_cmp),
          ("L", "l", code_cmp),
      ):
          result[lines_key] = comparer.final(**final_options)
          result[pos_key] = comparer.pos
      return result
  def diff_to_conversation_file(file, conv, debug=False, compare=CompareFragments):
      """Measure how much of one changed file is covered by a conversation.

      :param PatchedFile file: file updated by `diff`, it is a list of Hunk's
      :param dict conv: ChatGPT link mention as `ChatgptSharing` structure,
          see https://github.com/NAIST-SE/DevGPT/blob/main/README.md#chatgptsharing
      :param bool debug: return also data about individual files
      :param type[CompareBase] compare: compare class to use
      :return: totals under the "ALL" key; with `debug` also "FILE", "PATH"
          and per-hunk details under "HUNKS"
      :rtype: dict[str, dict[str, int | set] | dict]
      """
      totals = {
          "coverage": 0,
          "all": 0,
          "lines": set(),
          "preimage": set(),
          "preimage_all": 0,
          "preimage_coverage": 0,
      }
      ret = {"ALL": totals}

      fn = file.path
      if debug:
          ret["FILE"] = (file.source_file, file.target_file)
          ret["PATH"] = fn

      def line_numbers(entries):
          # with `debug` each entry is a tuple starting with the diff line number
          return map(itemgetter(0), entries) if debug else entries

      for i, hunk in enumerate(file):
          preimage, postimage = get_hunk_images(hunk)
          pre = get_max_coverage(preimage, conv["Conversations"], Compare=compare,
                                 ret_chat_line_no=debug, ret_score=debug)
          post = get_max_coverage(postimage, conv["Conversations"], Compare=compare,
                                  ret_chat_line_no=debug, ret_score=debug)

          # Only 'Answer' and 'ListOfCode' for post
          covered = set(line_numbers(post["A"])) | set(line_numbers(post["L"]))
          # TODO: check how many remove lines from 'P'
          #       that are exactly the same as in 'A' + 'L';
          #       this has to be done on source lines and may be expensive
          totals["coverage"] += len(covered)
          totals["all"] += len(postimage)
          totals["lines"] = totals["lines"] | covered

          # Only 'Prompt' for pre
          pre_set = set(line_numbers(pre["P"]))
          totals["preimage"] = totals["preimage"] | pre_set
          totals["preimage_coverage"] += len(pre_set)
          totals["preimage_all"] += len(preimage)

          if debug:
              ret.setdefault("HUNKS", {})[i] = {
                  "pre": pre,
                  "post": post,
                  "lines": list(covered),
              }

      return ret
  def diff_to_conversation(diff, conv, debug=False, compare = CompareFragments):
      """Sum up conversation coverage over all files of a diff.

      Files are processed in parallel by :func:`diff_to_conversation_file`.

      :param PatchSet diff: result of running GitRepo.unidiff(), it is a list of PatchedFile's
      :param dict conv: ChatGPT link mention as `ChatgptSharing` structure,
          see https://github.com/NAIST-SE/DevGPT/blob/main/README.md#chatgptsharing
      :param bool debug: passed down to :func:`diff_to_conversation_file`
      :param type[CompareBase] compare: compare class to use
      :return: totals under the "ALL" key; with `debug` also per-file data
          under "FILES"
      :rtype: dict[str, dict[str, int | list]]
      """
      totals = {"coverage": 0, "all": 0, "lines": [], 'preimage': [], 'preimage_all': 0, 'preimage_coverage': 0}
      ret = {"ALL": totals}
      if debug:
          totals["debug"] = True

      if "Conversations" not in conv:
          return ret

      per_file = Parallel(n_jobs=-1)(
          delayed(diff_to_conversation_file)(file, conv, debug, compare)
          for file in diff
      )

      for r in per_file:
          file_totals = r['ALL']
          for counter in ("coverage", "all", "preimage_coverage", "preimage_all"):
              totals[counter] += file_totals[counter]
          totals["lines"].extend(file_totals["lines"])
          totals["preimage"].extend(file_totals["preimage"])

          # per-file details are kept only for results that carry a 'PATH'
          if debug and 'PATH' in r:
              ret.setdefault('FILES', {})[r['PATH']] = {
                  key: value
                  for key, value in r.items()
                  if key in ('FILE', 'HUNKS')
              }

      return ret
Discard
Tip!

Press p to see the previous file, or n to see the next file