clustering_tfidf.py
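A quick summary of what the script does: it vectorizes labeled code blocks with TF-IDF, tunes the number of clusters for three scikit-learn models (AgglomerativeClustering, KMeans, MiniBatchKMeans) by completeness score against the graph-vertex labels, evaluates the tuned models on a held-out test split, and logs the hyperparameters and metrics with the DagsHub logger.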

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, MiniBatchKMeans, AgglomerativeClustering
from sklearn.metrics.cluster import completeness_score
import pandas as pd
import numpy as np
import dagshub
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("LABELED_DATA",
                    help="path to the regex-labeled CSV for the graph version you want to cluster",
                    type=str)
args = parser.parse_args()
LABELED_DATA = args.LABELED_DATA

# Load the labeled data: each row holds a code block and the id of the
# graph vertex it was labeled with.
df = pd.read_csv(LABELED_DATA)
X = df['code_block']
y = df['graph_vertex_id']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
number_of_vertices = y.max()

# Vectorize the code blocks with TF-IDF; fit on the train split only to avoid leakage.
tfidf_vectorizer = TfidfVectorizer()
tfidf_Xtrain = tfidf_vectorizer.fit_transform(X_train)
tfidf_Xtest = tfidf_vectorizer.transform(X_test)


def find_optimal_clusters(train_data, target_data, model, n_clusters):
    """Fit the model for each candidate cluster count and pick the one
    with the highest completeness score against the training labels."""
    history = []
    for k in n_clusters:
        clustering = model.set_params(n_clusters=k).fit(train_data)
        history.append(completeness_score(target_data, clustering.labels_))
    best_k = n_clusters[np.argmax(history)]
    model.set_params(n_clusters=best_k)
    print("For model " + str(model))
    print("optimal number of clusters: " + str(best_k))
    print("with score: " + str(np.max(history)))
    print("-------\n")
    return best_k, np.max(history)


def experiments(models, train_data, test_data, target_train, target_test, n_clusters):
    """Tune the cluster count for every model on the training data, then
    report each tuned model's completeness score on the test data."""
    opt_clusts = []
    for data, model in zip(train_data, models):
        opt_clust, best_acc = find_optimal_clusters(data, target_train, model, n_clusters)
        opt_clusts.append(opt_clust)
    print("Completeness score on test data")
    best_on_test = []
    for train, test, model, k in zip(train_data, test_data, models, opt_clusts):
        model.set_params(n_clusters=k)
        print("For model " + str(model))
        if isinstance(model, AgglomerativeClustering):
            # AgglomerativeClustering has no predict(); cluster the test split directly.
            score = completeness_score(target_test, model.fit_predict(test))
        else:
            # Refit with the tuned cluster count before predicting on the test split.
            model.fit(train)
            score = completeness_score(target_test, model.predict(test))
        print(score)
        best_on_test.append(score)
        print("-------\n")
    return best_on_test, opt_clusts


models = [AgglomerativeClustering(), KMeans(), MiniBatchKMeans()]
# AgglomerativeClustering needs a dense matrix; the k-means variants accept sparse input.
train_data = [tfidf_Xtrain.toarray(), tfidf_Xtrain, tfidf_Xtrain]
test_data = [tfidf_Xtest.toarray(), tfidf_Xtest, tfidf_Xtest]

print("For number of clusters from 2 to 100")
experiments(models, train_data, test_data, y_train, y_test, range(2, 101, 2))

print("-------\n\nFor number of clusters around real number of vertices")
best_on_test, optimal_clusters = experiments(models, train_data,
                                             test_data, y_train, y_test,
                                             range(number_of_vertices - 5, number_of_vertices + 20, 2))

data_meta = {'DATASET_PATH': LABELED_DATA,
             'nrows': X.shape[0],
             'label': ['-'],
             'model': ['-'],
             'script_dir': __file__}
metric_results = {'completeness_score_AgglomerativeClustering': best_on_test[0],
                  'completeness_score_KMeans': best_on_test[1],
                  'completeness_score_MiniBatchKMeans': best_on_test[2]}
AgglomerativeClustering_params = {'n_clusters': optimal_clusters[0]}
KMeans_params = {'n_clusters': optimal_clusters[1]}
MiniBatchKMeans_params = {'n_clusters': optimal_clusters[2]}

# Log hyperparameters and metrics with the DagsHub logger.
with dagshub.dagshub_logger() as logger:
    print("saving the results..")
    logger.log_hyperparams(data_meta)
    logger.log_hyperparams(AgglomerativeClustering_params)
    logger.log_hyperparams(KMeans_params)
    logger.log_hyperparams(MiniBatchKMeans_params)
    logger.log_metrics(metric_results)
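A minimal invocation sketch, assuming the labeled CSV lives at a hypothetical path such as data/labeled_code_blocks.csv and has the code_block and graph_vertex_id columns the script expects:

python clustering_tfidf.py data/labeled_code_blocks.csv

Note that AgglomerativeClustering is evaluated with fit_predict on the test split because, unlike the k-means variants, it exposes no predict() method; the other two models are refit on the training vectors with the tuned cluster count before predicting on the test vectors.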