Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

embedding.py 2.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
  1. import numpy as np
  2. import pandas as pd
  3. import spacy
  4. import pickle
  5. from tqdm import tqdm
  6. from config import *
  7. nlp = spacy.load('en_core_web_sm')
  8. print('Word-embedding training data: ')
  9. # keep 'null' as string, instead of being recognized as null value
  10. words_train = pd.read_csv(PROCESSED_TRAIN_PATH, na_filter=False)
  11. # pick random context-target pairs for embedding
  12. sampled_index_train = np.random.choice(range(len(words_train)), size=TRAIN_SET_SIZE, replace=False)
  13. sampled_train = words_train.iloc[sampled_index_train, :].copy()
  14. context_train = sampled_train.drop(columns='target').values
  15. target_train = sampled_train['target'].values
  16. # word embedding using spacy pretrained word embedding model
  17. embedded_context_train = np.zeros((len(context_train), CONTEXT_WINDOW, 96), dtype=np.float32)
  18. embedded_target_train = np.zeros((len(target_train), 96), dtype=np.float32)
  19. for i in tqdm(range(len(context_train))):
  20. each_words = context_train[i]
  21. for j, each_word in enumerate(each_words):
  22. embedded_context_train[i, j, :] = nlp(each_word).vector
  23. embedded_target_train[i, :] = nlp(target_train[i]).vector
  24. print()
  25. print(f' [DEBUG] Shape of embedded context words of training set: {embedded_context_train.shape}')
  26. print(f' [DEBUG] Shape of embedded target words of training set: {embedded_target_train.shape}')
  27. # store embedded data
  28. with open(EMBEDDED_CONTEXT_TRAIN_PATH, 'wb') as f:
  29. pickle.dump(embedded_context_train, f)
  30. with open(EMBEDDED_TARGET_TRAIN_PATH, 'wb') as f:
  31. pickle.dump(embedded_target_train, f)
  32. print()
  33. print('Word-embedding test data: ')
  34. words_test = pd.read_csv(PROCESSED_TEST_PATH, na_filter=False)
  35. sampled_index_test = np.random.choice(range(len(words_test)), size=TEST_SET_SIZE, replace=False)
  36. sampled_test = words_test.iloc[sampled_index_test, :].copy()
  37. context_test = sampled_test.drop(columns='target').values
  38. target_test = sampled_test['target'].values
  39. embedded_context_test = np.zeros((len(context_test), CONTEXT_WINDOW, 96), dtype=np.float32)
  40. embedded_target_test = np.zeros((len(target_test), 96), dtype=np.float32)
  41. for i in tqdm(range(len(context_test))):
  42. each_words = context_test[i]
  43. for j, each_word in enumerate(each_words):
  44. embedded_context_test[i, j, :] = nlp(each_word).vector
  45. embedded_target_test[i, :] = nlp(target_test[i]).vector
  46. print()
  47. print(f' [DEBUG] Shape of embedded context words of test set: {embedded_context_test.shape}')
  48. print(f' [DEBUG] Shape of embedded target words of test set: {embedded_target_test.shape}')
  49. # store embedded data
  50. with open(EMBEDDED_CONTEXT_TEST_PATH, 'wb') as f:
  51. pickle.dump(embedded_context_test, f)
  52. with open(EMBEDDED_TARGET_TEST_PATH, 'wb') as f:
  53. pickle.dump(embedded_target_test, f)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...