#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Usage: {script_name} <dataset_path> <repositories_path> <output.json>

Clones all git repositories into the given directory, skipping already present ones.
Creates <output.json> with information about cloned repositories, which can
be used by those later stages of the pipeline that need access to Git repos.

Example:
    python scripts/data/download_repositories.py data/DevGPT/ \\
        data/repositories data/repositories_download_status.json
"""
import json
import sys
from pathlib import Path

import pandas as pd
from tqdm import tqdm

from src.data.sharings import find_sharings_files
from src.utils.functools import timed
from src.utils.git import GitRepo

# constants
ERROR_ARGS = 1
PRETTY_PRINT_OUTPUT = True


def download_repository(repository, repositories_path, verbose=True):
    """Clones git repository `repository` into `repositories_path`

    Assumes that existing directories are already cloned repositories;
    problems are reported, but not raised as exceptions.

    :param str repository: string in format "owner/repository"
    :param Path repositories_path: path to the directory to download repositories into
    :param bool verbose: whether to print non-error debugging-like information
    :return: dictionary with information about cloned repository, or None on failure
    :rtype: dict or None
    """
    project_name = repository.split('/')[1]
    repository_name = '_'.join(repository.split('/'))
    repository_url = 'https://github.com/' + repository + '.git'
    repository_dir = repositories_path / repository_name

    # check if repository was already cloned
    if repository_dir.is_dir():
        if verbose:
            print(f"Repository already exists: {repository_name}")
        return {
            'project': repository_name,
            'repository': repository,
            'repository_url': repository_url,
            'repository_path': str(repository_dir),
        }
    elif repository_dir.exists():
        print(f"Could not clone repository for {repository_name} "
              f"because '{repository_dir}' is in the way")
        return None

    # upgrade from legacy storage, where repository was stored under `project_name`
    legacy_dir = repositories_path / project_name
    if legacy_dir.is_symlink():
        # legacy dir already processed
        reference_dir = legacy_dir.readlink()
        if not reference_dir.is_absolute():
            # a relative symlink target resolves against the symlink's own directory
            reference_dir = repositories_path / reference_dir
        # there is a possibility that this clone is not needed, if referenced URL == our URL
        repo = GitRepo.clone_repository(repository_url, repository_dir,
                                        reference_local_repository=reference_dir)
        if repo is None:
            print(f"Could not clone {repository_name} at {repository_url}\n"
                  f"using local '{reference_dir}' as reference")
            return None
        return {
            'project': repository_name,
            'repository': repository,
            'repository_url': repository_url,
            'repository_path': str(repo),
            'reference_repository_path': str(reference_dir),
        }

    if legacy_dir.is_dir():
        # upgrade legacy storage
        origin_url = GitRepo(legacy_dir).get_config('remote.origin.url')
        if repository_url.startswith(origin_url):  # take into account repo.git vs repo
            # at this point we know that `repository_dir` does not exist
            legacy_dir.replace(repository_dir)
            legacy_dir.symlink_to(repository_dir, target_is_directory=True)
            return {
                'project': repository_name,
                'repository': repository,
                'repository_url': repository_url,
                'repository_path': str(repository_dir),
                'alternative_path': str(legacy_dir),
            }
        else:
            # we can use it as reference, but need to dissociate,
            # because the symlink can be removed
            repo = GitRepo.clone_repository(repository_url, repository_dir,
                                            reference_local_repository=legacy_dir,
                                            dissociate=True)
            if repo is None:
                print(f"Could not clone {repository_name} at {repository_url}\n"
                      f"using local '{legacy_dir}' as temporary reference")
                return None
            return {
                'project': repository_name,
                'repository': repository,
                'repository_url': repository_url,
                'repository_path': str(repo),
                'disassociated_reference_repository_path': str(legacy_dir),
            }

    try:
        repo = GitRepo.clone_repository(repository_url, repository_dir)
        if repo is None:
            print(f"Could not clone {repository_name} at {repository_url}")
            return None
        return {
            'project': repository_name,
            'repository': repository,
            'repository_url': repository_url,
            'repository_path': str(repo),
        }
    except Exception as ex:
        print(f"Could not clone repository for {repository_name} at {repository_url}: {ex}")
        return None

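
# Usage sketch (hypothetical call; assumes GitRepo.clone_repository from
# src.utils.git works as it is used above):
#
#     info = download_repository('owner/project', Path('data/repositories'))
#     # on success, `info` is a dict with at least 'project', 'repository',
#     # 'repository_url' and 'repository_path' keys; on failure it is None
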

def download_repositories(repositories, repositories_path):
    """Clones all repositories into `repositories_path`

    :param list[str] repositories: list of repository names
    :param Path repositories_path: path to save data
    :return: information about successfully cloned repositories
    :rtype: list[dict]
    """
    result = []
    for repository in tqdm(repositories, desc='repositories'):
        repo_info = download_repository(repository, repositories_path,
                                        verbose=False)
        if repo_info is not None:
            result.append(repo_info)

    return result


def load_sharings(sharings_path):
    """Load a single sharings JSON file into a DataFrame of its 'Sources' records"""
    with open(sharings_path) as sharings_file:
        sharings = json.load(sharings_file)

    df = pd.DataFrame.from_records(sharings['Sources'])
    return df


def combine_sharings(sharings_paths):
    """Concatenate sharings from all `sharings_paths` into a single DataFrame"""
    dfs = []
    for sharing_path in sharings_paths:
        dfs.append(load_sharings(sharing_path))

    df = pd.concat(dfs)
    return df

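
# Expected input shape (a sketch based on how the data is used below; any other
# fields in each source entry simply become additional DataFrame columns):
#
#     {
#         "Sources": [
#             {"RepoName": "owner/project", ...},
#             ...
#         ]
#     }
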

@timed
def main():
    # handle command line parameters
    # {script_name} <dataset_path> <repositories_path> <output.json>
    if len(sys.argv) != 3 + 1:  # sys.argv[0] is script name
        print(__doc__.format(script_name=sys.argv[0]))
        sys.exit(ERROR_ARGS)

    dataset_directory_path = Path(sys.argv[1])
    repositories_path = Path(sys.argv[2])
    output_file_path = Path(sys.argv[3])

    print(f"Reading data about dataset from '{dataset_directory_path}'...", file=sys.stderr)
    commit_sharings_paths, issue_sharings_paths, pr_sharings_paths = find_sharings_files(dataset_directory_path)
    commit_df = combine_sharings(commit_sharings_paths)
    issue_df = combine_sharings(issue_sharings_paths)
    pr_df = combine_sharings(pr_sharings_paths)

    commit_repositories = list(commit_df['RepoName'].unique())
    issue_repositories = list(issue_df['RepoName'].unique())
    pr_repositories = list(pr_df['RepoName'].unique())
    repositories = list(set(commit_repositories + issue_repositories + pr_repositories))

    print(f"Cloning {len(repositories)} repositories into '{repositories_path}'...", file=sys.stderr)
    cloned_data = download_repositories(repositories, repositories_path)

    output_file_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"Writing output data to '{output_file_path}'...", file=sys.stderr)
    with open(output_file_path, 'w') as output_file:
        if PRETTY_PRINT_OUTPUT:
            json.dump(cloned_data, output_file, indent=2)
        else:
            json.dump(cloned_data, output_file)

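
# Each entry in the output JSON describes one successfully cloned repository,
# for example (a sketch; the exact keys depend on which clone path was taken,
# see the dicts returned by download_repository above):
#
#     {
#         "project": "owner_project",
#         "repository": "owner/project",
#         "repository_url": "https://github.com/owner/project.git",
#         "repository_path": "data/repositories/owner_project"
#     }
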

if __name__ == '__main__':
    main()