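"""Cluster TF-IDF vectors of labeled code blocks and score against graph vertices.

Loads a regex-labeled CSV, vectorizes the 'code_block' column with TF-IDF,
tunes the cluster count of AgglomerativeClustering, KMeans, and MiniBatchKMeans
by completeness score against 'graph_vertex_id', and logs the results to DagsHub.
"""
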
import argparse

import dagshub
import numpy as np
import pandas as pd
from sklearn.cluster import AgglomerativeClustering, KMeans, MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.cluster import completeness_score
from sklearn.model_selection import train_test_split

parser = argparse.ArgumentParser()
parser.add_argument(
    "LABELED_DATA",
    help="path to the CSV that regex labeling produced for the chosen graph version",
    type=str,
)
args = parser.parse_args()
LABELED_DATA = args.LABELED_DATA

df = pd.read_csv(LABELED_DATA)
X = df['code_block']
y = df['graph_vertex_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Assumes vertex ids are contiguous from 1, so the maximum id is the vertex count.
number_of_vertices = y.max()

tfidf_vectorizer = TfidfVectorizer()
tfidf_Xtrain = tfidf_vectorizer.fit_transform(X_train)
tfidf_Xtest = tfidf_vectorizer.transform(X_test)

def find_optimal_clusters(train_data, target_data, model, n_clusters):
    """Sweep the candidate cluster counts and keep the best-scoring one."""
    history = []
    for k in n_clusters:
        clustering = model.set_params(n_clusters=k).fit(train_data)
        history.append(completeness_score(target_data, clustering.labels_))
    best_k = n_clusters[np.argmax(history)]
    # Refit with the optimal cluster count: set_params alone would leave the
    # model fitted with the last k from the sweep.
    model.set_params(n_clusters=best_k).fit(train_data)
    print("For model " + str(model))
    print("optimal number of clusters: " + str(best_k))
    print("with score: " + str(np.max(history)))
    print("-------\n")
    return best_k, np.max(history)

def experiments(models, train_data, test_data, target_train, target_test, n_clusters):
    """Tune each model on the training data, then score it on the test data."""
    opt_clusts = []
    for data, model in zip(train_data, models):
        opt_clust, _ = find_optimal_clusters(data, target_train, model, n_clusters)
        opt_clusts.append(opt_clust)
    print("Completeness score on test data")
    best_on_test = []
    for data, model, k in zip(test_data, models, opt_clusts):
        model.set_params(n_clusters=k)
        print("For model " + str(model))
        if isinstance(model, AgglomerativeClustering):
            # AgglomerativeClustering has no predict(), so cluster the test data directly.
            score = completeness_score(target_test, model.fit_predict(data))
        else:
            # The KMeans variants were refit on the training data with the optimal k.
            score = completeness_score(target_test, model.predict(data))
        print(score)
        best_on_test.append(score)
        print("-------\n")
    return best_on_test, opt_clusts

models = [AgglomerativeClustering(), KMeans(), MiniBatchKMeans()]
# AgglomerativeClustering needs a dense matrix; the KMeans variants accept sparse input.
train_data = [tfidf_Xtrain.toarray(), tfidf_Xtrain, tfidf_Xtrain]
test_data = [tfidf_Xtest.toarray(), tfidf_Xtest, tfidf_Xtest]

print("For number of clusters from 2 to 100")
experiments(models, train_data, test_data, y_train, y_test, range(2, 101, 2))
print("-------\n\nFor number of clusters around real number of vertices")
best_on_test, optimal_clusters = experiments(
    models, train_data, test_data, y_train, y_test,
    range(number_of_vertices - 5, number_of_vertices + 20, 2),
)

data_meta = {
    'DATASET_PATH': LABELED_DATA,
    'nrows': X.shape[0],
    'label': ['-'],
    'model': ['-'],
    'script_dir': __file__,
}
metric_results = {
    'completeness_score_AgglomerativeClustering': best_on_test[0],
    'completeness_score_KMeans': best_on_test[1],
    'completeness_score_MiniBatchKMeans': best_on_test[2],
}
# The tuned hyperparameter is the cluster count; namespace the keys per model
# so successive log_hyperparams calls do not overwrite each other.
AgglomerativeClustering_params = {'n_clusters_AgglomerativeClustering': optimal_clusters[0]}
KMeans_params = {'n_clusters_KMeans': optimal_clusters[1]}
MiniBatchKMeans_params = {'n_clusters_MiniBatchKMeans': optimal_clusters[2]}

with dagshub.dagshub_logger() as logger:
    print("saving the results..")
    logger.log_hyperparams(data_meta)
    logger.log_hyperparams(AgglomerativeClustering_params)
    logger.log_hyperparams(KMeans_params)
    logger.log_hyperparams(MiniBatchKMeans_params)
    logger.log_metrics(metric_results)
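# Usage sketch (the script and CSV names below are hypothetical):
#   python clustering_experiments.py data/labeled_code_blocks.csv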