compute_changes_survival.py 23 KB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Usage: {script_name} <commit_sharings_df> <repositories.json> <output_commit_df> <output_lines_df>

Compute survival of changed lines for each commit in <commit_sharings_df>,
using cloned repositories (as described by <repositories.json>) and
reverse blame.

While at it, add some commit metadata to the dataframe.  This information
is gathered into a dataframe and saved in <output_commit_df>.

Information about the fate of each post-image changed line ("added" line in
the unified diff of commit changes), for example in which commit it vanished
if it did vanish, is gathered into a dataframe and saved in <output_lines_df>.

See the docstring for :func:`process_single_commit` to find the columns added to
the original <commit_sharings_df> columns in <output_commit_df>, and the docstring
for :func:`process_commit_changed_lines` to find the columns in <output_lines_df>.

Example:
    python scripts/data/compute_changes_survival.py \\
        data/interim/commit_sharings_df.csv data/repositories_download_status.json \\
        data/interim/commit_sharings_changes_survival_df.csv \\
        data/interim/commit_sharings_lines_survival_df.csv
"""
import subprocess
import sys
from collections import defaultdict, Counter
from pathlib import Path
from typing import Optional, Tuple, List, NamedTuple

import pandas as pd
import unidiff
from tqdm import tqdm

from src.data.common import load_repositories_json, reponame_to_repo_path
from src.utils.functools import timed
from src.utils.git import GitRepo, changes_survival_perc

# constants
ERROR_ARGS = 1
ERROR_OTHER = 2

class GptCommitInfo(NamedTuple):
    """Return value for the process_single_commit() function"""
    curr_commit_info: dict
    line_survival_info: Optional[dict] = None
    blamed_commits_data: Optional[dict] = None

def process_single_commit(repo: GitRepo,
                          project_name: str, gpt_commit: str,
                          process_stats: dict) -> GptCommitInfo:
    """Process a single commit from the DevGPT dataset, computing its survival info

    Using reverse blame for each of the lines "added" by the patch of `gpt_commit`,
    find whether some of them vanish at some point, or whether they all survive to
    the present day (until HEAD).  The result of the reverse blame is added to the
    return value of this function, together with data about the commit that was
    computed or extracted.

    This function returns a GptCommitInfo named tuple, where the first field,
    `curr_commit_info`, contains information about the processed commit,
    `line_survival_info` comes directly from repo.changes_survival(),
    and `blamed_commits_data` is information about blamed commits
    from repo.changes_survival(), post-processed to be a dict with
    commit SHA-1 identifiers as keys, and commit data as values.

    The data in `curr_commit_info` has the following structure:

    - 'Sha': SHA-1 identifier of the commit from DevGPT, to be used for the join
      with the <commit_sharings_df> data
    - 'author_timestamp': Unix timestamp of when the `Sha` commit was authored,
      should be the same date as the `AuthorAt` field in the DevGPT dataset
    - 'committer_timestamp': Unix timestamp of when the `Sha` commit was
      committed to the repo, should be the same date as `CommitAt` from DevGPT
    - 'n_parents': number of parents of the `Sha` commit, to distinguish merge
      and root commits
    - 'is_merged_HEAD': boolean value denoting whether `Sha` is merged
      into HEAD, or in other words whether the HEAD codeline contains `Sha`
    - 'error': boolean value, whether there were errors while trying to
      compute the reverse blame for the commit; if True, all following fields
      will be missing (will be N/A in the dataframe)
    - 'change_lines_survived': number of lines in the post-image that survived
      until the present day (until HEAD)
    - 'change_lines_total': total number of lines in the post-image of `Sha`
      ("added" lines in the unified diff of the `Sha` commit changes)
    - 'min_died_committer_timestamp': Unix timestamp of the earliest date
      when the first line of the `Sha` post-image changes vanished; missing
      if all changed lines survived

    :param GitRepo repo: local, cloned `project_name` repository
    :param str project_name: name of the project (full name on GitHub),
        e.g. "sqlalchemy/sqlalchemy"
    :param str gpt_commit: commit from the DevGPT dataset, for example one
        whose commit message includes a ChatGPT sharing link
    :param dict process_stats: used to gather statistics about the process
    :return: data about the commit, and reverse blame info
    :rtype: GptCommitInfo
    """
    try:
        commit_metadata = repo.get_commit_metadata(gpt_commit)
    except subprocess.CalledProcessError as err:
        tqdm.write("ERROR when calling repo.get_commit_metadata(gpt_commit)")
        tqdm.write(f"{err=}")
        if hasattr(err, 'stderr') and err.stderr:
            tqdm.write(f"-----\n{err.stderr}\n-----")
        tqdm.write("Exiting...")
        sys.exit(ERROR_OTHER)

    augment_curr = {
        'Sha': gpt_commit,  # to be used for join
        'Sha_is_valid': True,
        'author_timestamp': commit_metadata['author']['timestamp'],
        'committer_timestamp': commit_metadata['committer']['timestamp'],
        'n_parents': len(commit_metadata['parents']),
    }

    is_merged = repo.check_merged_into(gpt_commit, 'HEAD')
    augment_curr['is_merged_HEAD'] = bool(is_merged)
    if not is_merged:
        # TODO: add to lines_data even if commit is not merged into HEAD
        # (currently, so far all commits are found to be merged)
        process_stats['n_unmerged'] += 1
        return GptCommitInfo(augment_curr)

    # at this point we know that HEAD contains gpt_commit
    commits_from_HEAD = repo.count_commits(until_commit=gpt_commit)
    augment_curr['number_of_commits_from_HEAD'] = commits_from_HEAD

    try:
        commits_data, survival_info = repo.changes_survival(gpt_commit)
        augment_curr['error'] = False
    except subprocess.CalledProcessError as err:
        tqdm.write(f"{err=}")
        if hasattr(err, 'stderr') and err.stderr:
            tqdm.write(f"-----\n{err.stderr}\n-----")
        augment_curr['error'] = True
        process_stats['n_errors'] += 1
        return GptCommitInfo(augment_curr)
    except unidiff.UnidiffParseError as err:
        tqdm.write(f"Project '{project_name}', commit {gpt_commit}\n"
                   f"  at '{repo!s}'")
        tqdm.write(f"{err=}")
        augment_curr['error'] = True
        process_stats['n_errors'] += 1
        return GptCommitInfo(augment_curr)

    lines_survived, lines_total = changes_survival_perc(survival_info)
    augment_curr.update({
        'change_lines_survived': lines_survived,
        'change_lines_total': lines_total,
    })
    process_stats['lines_survived_sum'] += lines_survived
    process_stats['lines_total_sum'] += lines_total

    # TODO: extract this into a separate function
    all_blame_commit_data = None
    if lines_survived < lines_total:
        survived_until = []
        all_blame_commit_data = {}
        for change_path_data in commits_data.values():
            all_blame_commit_data.update(change_path_data)
        for change_path_data in commits_data.values():
            for blame_commit_data in change_path_data.values():
                if 'previous' in blame_commit_data:
                    blame_prev = blame_commit_data['previous'].split(' ')[0]
                    if blame_prev in all_blame_commit_data:
                        blame_prev_timestamp = int(all_blame_commit_data[blame_prev]['committer-time'])
                    else:
                        blame_prev_timestamp = repo.get_commit_metadata(blame_prev)['committer']['timestamp']
                    survived_until.append(blame_prev_timestamp)

        # DEBUGGING for 'min_died_committer_timestamp'
        # tqdm.write(f"* {project_name} {gpt_commit[:8]} changes died at {sorted(survived_until)}")
        if survived_until:  # is not empty
            augment_curr['min_died_committer_timestamp'] = min(survived_until)

    return GptCommitInfo(curr_commit_info=augment_curr,
                         line_survival_info=survival_info,
                         blamed_commits_data=all_blame_commit_data)
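
# Illustrative sketch, not part of the original script: for a merged, error-free
# commit, the `curr_commit_info` dict documented above might look roughly like
# the following (all values are made up; keys follow the code and docstring above):
#     {'Sha': '0df9759b73cb2081...', 'Sha_is_valid': True,
#      'author_timestamp': 1690000000, 'committer_timestamp': 1690000100,
#      'n_parents': 1, 'is_merged_HEAD': True,
#      'number_of_commits_from_HEAD': 42, 'error': False,
#      'change_lines_survived': 10, 'change_lines_total': 12,
#      'min_died_committer_timestamp': 1695000000}
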
def process_commit_changed_lines(repo: GitRepo, url: str,
                                 project_name: str, gpt_commit: str,
                                 survival_info: dict, blamed_commits_info: dict,
                                 process_stats: dict) -> List[dict]:
    """Compute survival for each changed (post-image) line in the given commit

    Using reverse blame for each of the lines "added" by the patch of `gpt_commit`,
    find whether they vanish at some point, or whether they survive to the present
    day (until HEAD).

    Fill in information about the relevant commits related to the reverse history
    of the line: the last commit in which the line was still seen in the same form
    as in `gpt_commit`, the first commit that does not have the line in question
    (if there is such a commit), and of course the starting commit:
    `gpt_commit`.  The part of the per-commit information that is needed for
    survival analysis is the timestamp (the author timestamp, and the
    committer timestamp).

    The output of 'git blame --reverse' uses the same nomenclature, the
    same terms, as ordinary 'git blame' - which is much more common, and
    which was created first.

    The returned data has the following structure (dicts in the list
    have the following keys):

    - 'URL': URL of the mentioned source (that includes the ChatGPT link)
    - 'RepoName': full name of the repository, from the DevGPT dataset, in which
      the commit identified by `Sha` can be found
    - 'Sha': SHA-1 identifier of the commit from DevGPT, can be used for the join
      (the same field name as in the DevGPT dataset files)
    - 'Sha_filename': post-image name of a file changed by the `Sha` commit;
      only those files changed by the `Sha` commit for which there is a
      non-empty post-image are included in the "dataframe"
    - 'Sha_lineno': line number in `Sha_filename` at `Sha`
    - 'last_commit': SHA-1 identifier of the last commit (in chronological
      order starting from `Sha`) where the given line still exists
    - 'last_filename': file in which the line is at `last_commit`,
      taking into account code movement and code copying, if 'git blame'
      was configured to consider those
    - 'last_lineno': line number in `last_filename` at `last_commit`
    - 'line': the contents of the line (present both in `Sha` and
      `last_commit`)
    - 'diff_line_no': line number in the unified diff of the `Sha` commit changes
    - 'next_commit': SHA-1 identifier of the next commit after `last_commit`,
      i.e. a commit that has `last_commit` as a parent, and which does not
      contain the line in question any longer; might be N/A if the line
      survived until the present (until HEAD)
    - 'next_filename': name of `last_filename` in `next_commit`,
      taking into account file renames if 'git blame' was configured
      to do so; if there is no `next_commit` it is None / N/A
    - 'Sha_author_timestamp', 'Sha_committer_timestamp': Unix timestamps
      of when the `Sha` commit was authored, and when it was committed to the repo
    - 'last_author_timestamp', 'last_committer_timestamp': as above,
      but for the `last_commit` commit
    - 'next_author_timestamp', 'next_committer_timestamp': as above,
      but for the `next_commit` commit, if it exists, else None / N/A

    :param GitRepo repo: local, cloned `project_name` repository
    :param str url: URL of the mentioned source (from DevGPT),
        e.g. "https://github.com/sqlalchemy/sqlalchemy/commit/0df9759b73cb20818a35bd182697fac36dda3484"
    :param str project_name: name of the project (full name on GitHub),
        e.g. "sqlalchemy/sqlalchemy"
    :param str gpt_commit: commit from the DevGPT dataset, for example one
        whose commit message includes a ChatGPT sharing link
    :param dict survival_info: reverse blame information about lines,
        generated by the repo.changes_survival() method
    :param dict blamed_commits_info: information about blamed commits,
        gathered per-project from different reverse blame runs
    :param dict process_stats: used to gather statistics about the process
    :return: information about lines' lifetimes, in a format suitable for
        converting into a dataframe with pd.DataFrame.from_records()
    :rtype: list[dict]
    """
    lines_data = []
    for change_path, change_lines_list in survival_info.items():
        for change_line_info in change_lines_list:
            if 'previous' in change_line_info:
                prev_commit, prev_file = change_line_info['previous'].split(' ', maxsplit=1)
                change_line_info['previous_commit'] = prev_commit
                change_line_info['previous_filename'] = prev_file

            for sha_key in 'Sha', 'commit', 'previous_commit':
                if sha_key == 'previous_commit' and sha_key not in change_line_info:
                    continue
                if sha_key == 'Sha':
                    commit_sha = gpt_commit
                else:
                    commit_sha = change_line_info[sha_key]

                # TODO: create a function or transformation to remove this code duplication
                if commit_sha in blamed_commits_info:
                    process_stats[f"{sha_key}_metadata_from_blame"] += 1
                    change_line_info[f"{sha_key}_author_timestamp"] \
                        = int(blamed_commits_info[commit_sha]['author-time'])
                    change_line_info[f"{sha_key}_committer_timestamp"] \
                        = int(blamed_commits_info[commit_sha]['committer-time'])
                else:
                    process_stats[f"{sha_key}_metadata_from_repo"] += 1
                    commit_metadata = repo.get_commit_metadata(commit_sha)
                    change_line_info[f"{sha_key}_author_timestamp"] \
                        = commit_metadata['author']['timestamp']
                    change_line_info[f"{sha_key}_committer_timestamp"] \
                        = commit_metadata['committer']['timestamp']
                    # use blamed_commits_info as cache
                    blamed_commits_info[commit_sha] = {
                        'author-time': commit_metadata['author']['timestamp'],
                        'committer-time': commit_metadata['committer']['timestamp'],
                    }

            lines_data.append({
                # the same field names as used in the DevGPT dataset
                'URL': url,
                'RepoName': project_name,
                'Sha': gpt_commit,
                # field names renamed to be more meaningful
                'Sha_filename': change_path,
                'Sha_lineno': change_line_info['final'],
                'last_commit': change_line_info['commit'],
                'last_filename': change_line_info['original_filename'],
                'last_lineno': change_line_info['original'],
                'line': change_line_info['line'],
                'diff_line_no': change_line_info['unidiff.patch.Line'].diff_line_no,
                'next_commit':
                    change_line_info.get('previous_commit', None),
                'next_filename':
                    change_line_info.get('previous_filename', None),
                'Sha_author_timestamp': change_line_info['Sha_author_timestamp'],
                'Sha_committer_timestamp': change_line_info['Sha_committer_timestamp'],
                'last_author_timestamp': change_line_info['commit_author_timestamp'],
                'last_committer_timestamp': change_line_info['commit_committer_timestamp'],
                'next_author_timestamp':
                    change_line_info.get('previous_commit_author_timestamp', None),
                'next_committer_timestamp':
                    change_line_info.get('previous_commit_committer_timestamp', None),
            })

    return lines_data
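
# Illustrative sketch, not part of the original script: a single record appended
# to `lines_data` above might look roughly like the following, for a line that
# survived until HEAD (values are made up; keys follow the docstring above):
#     {'URL': 'https://github.com/sqlalchemy/sqlalchemy/commit/0df9759b73cb2081...',
#      'RepoName': 'sqlalchemy/sqlalchemy', 'Sha': '0df9759b73cb2081...',
#      'Sha_filename': 'lib/example.py', 'Sha_lineno': 120,
#      'last_commit': 'abcdef0123456789...', 'last_filename': 'lib/example.py',
#      'last_lineno': 125, 'line': '    return result\n', 'diff_line_no': 37,
#      'next_commit': None, 'next_filename': None,
#      'Sha_author_timestamp': 1690000000, 'Sha_committer_timestamp': 1690000100,
#      'last_author_timestamp': 1698000000, 'last_committer_timestamp': 1698000100,
#      'next_author_timestamp': None, 'next_committer_timestamp': None}
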
def process_commits(commits_df: pd.DataFrame, repo_clone_data: dict) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Process commits in the `commits_df` dataframe, augmenting the data

    For each commit, compute how many of its post-image lines survived to the
    current state of the project, and use that to augment the per-commit data.

    For each of the post-image change lines ("added" lines in the unified diff),
    gather and extract information about its survival, using reverse git blame.

    :param pd.DataFrame commits_df: DataFrame with commit sharings from DevGPT
    :param dict repo_clone_data: information about the cloned project repositories
    :return: tuple of the DataFrame augmented with changes survival information,
        and the DataFrame with information about change lines' survival
    :rtype: (pd.DataFrame, pd.DataFrame)

    TODO: replace the tuple with a named tuple for the return value
    """
    commits_df.rename(columns={'ModelGPT3.5': 'ModelGPT3_5'}, inplace=True)

    repo_cache = {}
    total_stats = Counter({
        'n_skipped': 0,
        'n_missing_sha': 0,
        'n_missing_commit': 0,
        'n_errors': 0,
        'n_unmerged': 0,
        'lines_survived_sum': 0,
        'lines_total_sum': 0,
    })
    augment_data = []
    lines_data = []
    all_blamed_commits_info = defaultdict(dict)

    for row in tqdm(commits_df.itertuples(index=False, name='GptCommit'), desc='commit'):
        url = row.URL
        project_name = row.RepoName
        gpt_commit = row.Sha

        repository_path = reponame_to_repo_path(repo_clone_data, project_name)
        if repository_path is None:
            total_stats['n_skipped'] += 1
        if pd.isna(gpt_commit):
            total_stats['n_missing_sha'] += 1
        if repository_path is None or pd.isna(gpt_commit):
            continue

        repo = repo_cache.get(project_name, None)
        if repo is None:
            # call only if needed
            repo = GitRepo(repository_path)
            # remember for re-use
            repo_cache[project_name] = repo

        gpt_commit_is_valid = repo.is_valid_commit(gpt_commit)
        if not gpt_commit_is_valid:
            total_stats['n_missing_commit'] += 1
            augment_data.append({
                'Sha': gpt_commit,  # to be used for join
                'Sha_is_valid': False,
            })
            continue

        augment_curr, survival_info, blamed_commits_info \
            = process_single_commit(repo, project_name, gpt_commit, total_stats)
        augment_data.append(augment_curr)
        if blamed_commits_info is not None:
            all_blamed_commits_info[project_name].update(blamed_commits_info)

        if survival_info is not None:
            commit_lines_data = process_commit_changed_lines(repo, url, project_name, gpt_commit,
                                                             survival_info, all_blamed_commits_info[project_name],
                                                             total_stats)
            lines_data.extend(commit_lines_data)

    if total_stats['n_skipped'] > 0:
        print(f"Skipped {total_stats['n_skipped']} rows because the repo was not cloned",
              file=sys.stderr)
    if total_stats['n_missing_sha'] > 0:
        print(f"Skipped {total_stats['n_missing_sha']} rows because of a missing/NA 'Sha'",
              file=sys.stderr)
    if total_stats['n_errors'] > 0:
        print(f"Skipped {total_stats['n_errors']} rows because of an error",
              file=sys.stderr)
    if total_stats['n_missing_commit'] > 0:
        print(f"There were {total_stats['n_missing_commit']} commits not found in their repo",
              file=sys.stderr)
    if total_stats['n_unmerged'] > 0:
        print(f"There were {total_stats['n_unmerged']} commits not merged into HEAD",
              file=sys.stderr)

    print(f"Created {len(repo_cache)} GitRepo objects", file=sys.stderr)
    print("Lines survival stats:", file=sys.stderr)
    if total_stats['lines_total_sum'] > 0:
        print(" "
              f"{total_stats['lines_survived_sum']} / {total_stats['lines_total_sum']} = "
              f"{100.0 * total_stats['lines_survived_sum'] / total_stats['lines_total_sum']:.2f}% lines survived; "
              f"{total_stats['lines_total_sum'] - total_stats['lines_survived_sum']} did not",
              file=sys.stderr)
    else:
        print(f"WARNING: captured {total_stats['lines_total_sum']} changed lines "
              f"and {total_stats['lines_survived_sum']} surviving lines",
              file=sys.stderr)

    # TODO: reduce code duplication
    print(" "
          f"orig commit metadata: {total_stats['Sha_metadata_from_blame']:6d} from blame, "
          f"{total_stats['Sha_metadata_from_repo']:5d} from repo = "
          f"{total_stats['Sha_metadata_from_blame'] + total_stats['Sha_metadata_from_repo']:6d} total",
          file=sys.stderr)
    print(" "
          f"last commit metadata: {total_stats['commit_metadata_from_blame']:6d} from blame, "
          f"{total_stats['commit_metadata_from_repo']:5d} from repo = "
          f"{total_stats['commit_metadata_from_blame'] + total_stats['commit_metadata_from_repo']:6d} total",
          file=sys.stderr)
    print(" "
          f"next commit metadata: {total_stats['previous_commit_metadata_from_blame']:6d} from blame, "
          f"{total_stats['previous_commit_metadata_from_repo']:5d} from repo = "
          f"{total_stats['previous_commit_metadata_from_blame'] + total_stats['previous_commit_metadata_from_repo']:6d} total",
          file=sys.stderr)
    print(" ",
          total_stats['Sha_metadata_from_repo'] +
          total_stats['commit_metadata_from_repo'] +
          total_stats['previous_commit_metadata_from_repo'],
          "from repo total")

    print(f"Creating dataframe with augmentation data from {len(augment_data)} records...",
          file=sys.stderr)
    augment_df = pd.DataFrame.from_records(augment_data)
    print(f"Creating dataframe with line survival data from {len(lines_data)} records...",
          file=sys.stderr)
    lines_df = pd.DataFrame.from_records(lines_data)

    print(f"Merging {commits_df.shape} with {augment_df.shape} dataframes on 'Sha'...", file=sys.stderr)
    return pd.merge(commits_df, augment_df, on='Sha', sort=False), lines_df

@timed
def main():
    # handle command line parameters
    # {script_name} <commit_sharings_df> <repositories.json> <output_commit_df> <output_lines_df>
    if len(sys.argv) != 4 + 1:  # sys.argv[0] is the script name
        print(__doc__.format(script_name=sys.argv[0]))
        sys.exit(ERROR_ARGS)

    commit_sharings_path = Path(sys.argv[1])
    repositories_info_path = Path(sys.argv[2])
    output_commit_file_path = Path(sys.argv[3])
    output_lines_file_path = Path(sys.argv[4])

    # ensure that directory/directories leading to output_*_file_path exist
    output_commit_file_path.parent.mkdir(parents=True, exist_ok=True)
    output_lines_file_path.parent.mkdir(parents=True, exist_ok=True)

    # .......................................................................
    # PROCESSING
    print(f"Reading commit sharings data from '{commit_sharings_path}'...",
          file=sys.stderr)
    commits_df = pd.read_csv(commit_sharings_path)
    repo_clone_data = load_repositories_json(repositories_info_path)

    print(f"Processing {commits_df.shape} commit sharings data...",
          file=sys.stderr)
    augmented_df, lines_df = process_commits(commits_df, repo_clone_data)

    print(f"Writing {augmented_df.shape} of augmented commit sharings data\n"
          f"  to '{output_commit_file_path}'", file=sys.stderr)
    augmented_df.to_csv(output_commit_file_path, index=False)

    print(f"Writing {lines_df.shape} of changed lines survival data\n"
          f"  to '{output_lines_file_path}'", file=sys.stderr)
    lines_df.to_csv(output_lines_file_path, index=False)


if __name__ == '__main__':
    main()
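
The lines CSV written by this script is meant to feed survival analysis of the ChatGPT-assisted changed lines. The minimal sketch below shows one way that output could be consumed downstream; it is only an illustration, not part of the script. It assumes the output path from the usage example in the module docstring, and it uses an analyst-chosen ANALYSIS_CUTOFF Unix timestamp for right-censoring lines that are still present at HEAD, since <output_lines_df> itself does not record the HEAD commit timestamp.

import pandas as pd

# Hypothetical downstream step (assumed path from the docstring example;
# ANALYSIS_CUTOFF is an analyst-chosen Unix timestamp, not produced by the script).
ANALYSIS_CUTOFF = 1_700_000_000

lines_df = pd.read_csv("data/interim/commit_sharings_lines_survival_df.csv")

# A changed line "died" if reverse blame found a next commit that no longer
# contains it; otherwise it is right-censored at the analysis cutoff.
died = lines_df['next_commit'].notna()
end_timestamp = lines_df['next_committer_timestamp'].where(died, ANALYSIS_CUTOFF)
duration_days = (end_timestamp - lines_df['Sha_committer_timestamp']) / (60 * 60 * 24)

survival = pd.DataFrame({'duration_days': duration_days, 'died': died})
print(survival.describe())
print(f"{died.mean():.1%} of changed lines vanished before the cutoff")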