Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

download_repositories.py 7.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
  1. #!/usr/bin/env python3
  2. # -*- coding: utf-8 -*-
  3. """Usage: {script_name} <dataset_path> <repositories_path> <output.json>
  4. Saves all git repositories to a specific directory, skipping already present ones.
  5. Creates <output.json> with information about cloned repositories, which can
  6. be used in those later stages of the pipeline that need access to Git repos.
  7. Example:
  8. python scripts/data/download_repositories.py data/DevGPT/ \\
  9. data/repositories data/repositories_download_status.json
  10. """
  11. import json
  12. import sys
  13. from pathlib import Path
  14. import pandas as pd
  15. from tqdm import tqdm
  16. from src.data.sharings import find_sharings_files
  17. from src.utils.functools import timed
  18. from src.utils.git import GitRepo
  19. # constants
  20. ERROR_ARGS = 1  # exit status passed to sys.exit() when the argument count is wrong
  21. PRETTY_PRINT_OUTPUT = True  # if true, main() writes the output JSON with indent=2
  22. def download_repository(repository, repositories_path, verbose=True):
  23. """Clones git repository `repository` into `repositories_path`
  24. Assumes existing directories are already cloned repositories,
  25. ignores all problems.
  26. :param str repository: string in format "owner/repository"
  27. :param Path repositories_path: path to directory where to download repositories
  28. :param bool verbose: whether to print non-error debugging-like information
  29. :return: dictionary with information about cloned repository, or None on failure
  30. :rtype: dict or None
  31. """
  32. project_name = repository.split('/')[1]
  33. repository_name = '_'.join(repository.split('/'))
  34. repository_url = 'https://github.com/' + repository + '.git'
  35. repository_dir = repositories_path / repository_name
  36. # check if repository was already cloned
  37. if repository_dir.is_dir():
  38. if verbose:
  39. print(f"Repository already exists: {repository_name}")
  40. return {
  41. 'project': repository_name,
  42. 'repository': repository,
  43. 'repository_url': repository_url,
  44. 'repository_path': str(repository_dir)
  45. }
  46. elif repository_dir.exists():
  47. print(f"Could not clone repository for {repository_name} "
  48. f"because '{repository_dir}' is in the way")
  49. return None
  50. # upgrade from legacy storage, where repository was stored under `project_name`
  51. legacy_dir = repositories_path / project_name
  52. if legacy_dir.is_symlink():
  53. # legacy dir already processed
  54. reference_dir = legacy_dir.readlink()
  55. if not reference_dir.is_absolute():
  56. reference_dir = repository_dir / reference_dir
  57. # there is possibility that this clone is not needed, if referenced URL == our URL
  58. repo = GitRepo.clone_repository(repository_url, repository_dir,
  59. reference_local_repository=reference_dir)
  60. if repo is None:
  61. print(f"Could not clone {repository_name} at {repository_url}\n"
  62. f"using local '{reference_dir}' as reference")
  63. return None
  64. return {
  65. 'project': repository_name,
  66. 'repository': repository,
  67. 'repository_url': repository_url,
  68. 'repository_path': str(repo),
  69. 'reference_repository_path': str(reference_dir),
  70. }
  71. if legacy_dir.is_dir():
  72. # upgrade legacy storage
  73. origin_url = GitRepo(legacy_dir).get_config('remote.origin.url')
  74. if repository_url.startswith(origin_url): # take into account repo.git vs repo
  75. # at this point we know that `repository_dir` does not exist
  76. legacy_dir.replace(repository_dir)
  77. legacy_dir.symlink_to(repository_dir, target_is_directory=True)
  78. return {
  79. 'project': repository_name,
  80. 'repository': repository,
  81. 'repository_url': repository_url,
  82. 'repository_path': str(repository_dir),
  83. 'alternative_path': str(legacy_dir),
  84. }
  85. else:
  86. # we can use it as reference, but need to dissociate
  87. # because the symlink can be removed
  88. repo = GitRepo.clone_repository(repository_url, repository_dir,
  89. reference_local_repository=legacy_dir,
  90. dissociate=True)
  91. if repo is None:
  92. print(f"Could not clone {repository_name} at {repository_url}\n"
  93. f"using local '{legacy_dir}' as temporary reference")
  94. return None
  95. return {
  96. 'project': repository_name,
  97. 'repository': repository,
  98. 'repository_url': repository_url,
  99. 'repository_path': str(repo),
  100. 'disassociated_reference_repository_path': str(legacy_dir),
  101. }
  102. try:
  103. repo = GitRepo.clone_repository(repository_url, repository_dir)
  104. if repo is None:
  105. print(f"Could not clone {repository_name} at {repository_url}")
  106. return None
  107. return {
  108. 'project': repository_name,
  109. 'repository': repository,
  110. 'repository_url': repository_url,
  111. 'repository_path': str(repo)
  112. }
  113. except Exception as ex:
  114. print(f"Could not clone repository for {repository_name} at {repository_url}: {ex}")
  115. return None
  116. def download_repositories(repositories, repositories_path):
  117. """Clones all repositories into `repositories_path`
  118. :param list[str] repositories: list of repository names
  119. :param Path repositories_path: path to save data
  120. :return: information about successfully cloned repositories
  121. :rtype: list[dict]
  122. """
  123. result = []
  124. for repository in tqdm(repositories, desc='repositories'):
  125. repo_info = download_repository(repository, repositories_path,
  126. verbose=False)
  127. if repo_info is not None:
  128. result.append(repo_info)
  129. return result
  130. def load_sharings(sharings_path):
  131. with open(sharings_path) as sharings_file:
  132. sharings = json.load(sharings_file)
  133. df = pd.DataFrame.from_records(sharings['Sources'])
  134. return df
  135. def combine_sharings(sharings_paths):
  136. dfs = []
  137. for sharing_path in sharings_paths:
  138. dfs.append(load_sharings(sharing_path))
  139. df = pd.concat(dfs)
  140. return df
  141. @timed
  142. def main():
  143. # handle command line parameters
  144. # {script_name} <dataset_path> <repositories_path> <output.json>
  145. if len(sys.argv) != 3 + 1: # sys.argv[0] is script name
  146. print(__doc__.format(script_name=sys.argv[0]))
  147. sys.exit(ERROR_ARGS)
  148. dataset_directory_path = Path(sys.argv[1])
  149. repositories_path = Path(sys.argv[2])
  150. output_file_path = Path(sys.argv[3])
  151. print(f"Reading data about dataset from '{dataset_directory_path}'...", file=sys.stderr)
  152. commit_sharings_paths, issue_sharings_paths, pr_sharings_paths = find_sharings_files(dataset_directory_path)
  153. commit_df = combine_sharings(commit_sharings_paths)
  154. issue_df = combine_sharings(issue_sharings_paths)
  155. pr_df = combine_sharings(pr_sharings_paths)
  156. commit_repositories = list(commit_df['RepoName'].unique())
  157. issue_repositories = list(issue_df['RepoName'].unique())
  158. pr_repositories = list(pr_df['RepoName'].unique())
  159. repositories = list(set(commit_repositories + issue_repositories + pr_repositories))
  160. print(f"Cloning {len(repositories)} repositories into '{repositories_path}'...", file=sys.stderr)
  161. cloned_data = download_repositories(repositories, repositories_path)
  162. output_file_path.parent.mkdir(parents=True, exist_ok=True)
  163. print(f"Writing output data to '{output_file_path}'...", file=sys.stderr)
  164. with open(output_file_path, 'w') as output_file:
  165. if PRETTY_PRINT_OUTPUT:
  166. json.dump(cloned_data, output_file, indent=2)
  167. else:
  168. json.dump(cloned_data, output_file)
  169. if __name__ == '__main__':  # run main() only when executed as a script, not on import
  170. main()
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...