word2vec.py

import logging
import re

import gensim.models
import pandas as pd
from gensim.models.callbacks import CallbackAny2Vec
from typing import List, Callable, Iterable

from mlutil.text import code_tokenization
from github_search import paperswithcode_tasks


def clean_whitespaces(s):
    """Collapse runs of whitespace into single spaces."""
    return re.sub(r"\s+", " ", s)


def default_tokenize_fn(text):
    return text.split()


def get_sentences(
    dfs: List[pd.DataFrame],
    text_cols: List[str],
    tokenize_fn: Callable[[str], Iterable[str]],
    max_length: int = 1000,
):
    """Yield tokenized texts, truncated to max_length tokens, drawn from
    the given text column of each dataframe."""
    text_series = (
        df[text_col].dropna().apply(tokenize_fn)
        for (df, text_col) in zip(dfs, text_cols)
    )
    return (list(sent)[:max_length] for texts in text_series for sent in texts)


class LossLogger(CallbackAny2Vec):
    """Output loss at each epoch."""

    def __init__(self):
        self.epoch = 1
        self.losses = []

    def on_epoch_begin(self, model):
        print(f"Epoch: {self.epoch}", end="\t")

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        self.losses.append(loss)
        print(f" Loss: {loss}")
        self.epoch += 1


class LossCallback(CallbackAny2Vec):
    """Print the per-epoch loss after each epoch.

    gensim's get_latest_training_loss() returns the loss accumulated since
    training started, so the per-epoch value is the difference from the
    previous epoch's reading.
    """

    def __init__(self):
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print("Loss after epoch {}: {}".format(self.epoch, loss))
        else:
            print(
                "Loss after epoch {}: {}".format(
                    self.epoch, loss - self.loss_previous_step
                )
            )
        self.epoch += 1
        self.loss_previous_step = loss


def make_w2v_model(sentences, embedding_dim=200):
    w2v_model = gensim.models.Word2Vec(
        size=embedding_dim,  # renamed to vector_size in gensim >= 4.0
        window=5,
        min_count=5,
        workers=24,
        callbacks=[LossCallback()],
    )
    w2v_model.build_vocab(sentences, progress_per=1000)
    return w2v_model


def train_word2vec(
    dfs,
    text_cols,
    epochs,
    embedding_dim,
    tokenize_fn: Callable[[str], Iterable[str]] = default_tokenize_fn,
):
    sentences = get_sentences(dfs, text_cols, tokenize_fn=tokenize_fn)
    if epochs > 1:
        # A generator can only be consumed once; materialize it so that
        # build_vocab and every training epoch can all iterate over it.
        sentences = list(sentences)
    w2v_model = make_w2v_model(sentences, embedding_dim)
    if epochs == 1:
        # build_vocab exhausted the generator; recreate it for training.
        sentences = get_sentences(dfs, text_cols, tokenize_fn=tokenize_fn)
    w2v_model.train(
        sentences,
        total_examples=w2v_model.corpus_count,
        epochs=epochs,
        report_delay=1,
        compute_loss=True,
    )
    return w2v_model


def save_w2v_model(w2v_model, bin_path, word2vec_path):
    if bin_path is not None:
        w2v_model.save(bin_path)
    if word2vec_path is not None:
        w2v_model.wv.save_word2vec_format(word2vec_path)


def train_abstract_readme_w2v(embedding_dim, epochs, upstream, product):
    """Train word2vec on paper abstracts and repository READMEs.

    upstream and product are pipeline task parameters mapping task names
    and output keys to file paths.
    """
    paperswithcode_df, all_papers_df = paperswithcode_tasks.get_paperswithcode_dfs()
    papers_with_readmes_df = pd.read_csv(upstream["make_readmes"])
    word2vec_model = train_word2vec(
        [all_papers_df, papers_with_readmes_df],
        ["abstract", "readme"],
        epochs,
        embedding_dim,
    )
    save_w2v_model(word2vec_model, str(product["binary"]), str(product["txt"]))


def train_python_code_w2v(python_file_path, embedding_dim, product):
    """Train word2vec on tokenized Python source files."""
    # The corpus is read from the feather file stored under the
    # corresponding parquet name.
    python_code_df = pd.read_feather(
        python_file_path.replace("parquet", "feather"), columns=["content"]
    )
    word2vec_model = train_word2vec(
        [python_code_df],
        ["content"],
        1,
        embedding_dim,
        tokenize_fn=code_tokenization.tokenize_python_code,
    )
    save_w2v_model(word2vec_model, str(product["binary"]), str(product["txt"]))
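
For reference, a minimal usage sketch of train_word2vec and save_w2v_model. It is illustrative only: the toy dataframe, column name, and output paths are made up, and the sentence is repeated so that every token clears the hardcoded min_count=5 threshold in make_w2v_model.

# Hypothetical usage example; assumes the functions above are in scope
# and a gensim version matching the size/vector_size parameter used in
# make_w2v_model.
import pandas as pd

toy_df = pd.DataFrame({"text": ["the quick brown fox jumps over the lazy dog"] * 50})
# epochs > 1, so train_word2vec materializes the token lists before training.
model = train_word2vec([toy_df], ["text"], epochs=3, embedding_dim=100)
print(model.wv.most_similar("fox"))
save_w2v_model(model, "w2v_model.bin", "w2v_vectors.txt")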