1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pickle

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

from config import *

nlp = spacy.load('en_core_web_sm')
# en_core_web_sm ships no static word vectors; `Token.vector` falls back to the
# tok2vec tensor (width 96 for this model). Derive the width at runtime instead
# of hardcoding 96 in four places, so swapping the model keeps the script valid.
EMBED_DIM = nlp('probe').vector.shape[0]


def _embed_words(csv_path, sample_size, label):
    """Sample rows from a processed CSV and embed context/target words.

    Parameters
    ----------
    csv_path : str
        Path to the processed CSV; one ``target`` column, the remaining
        columns form the context window (CONTEXT_WINDOW words, per config).
    sample_size : int
        Number of rows to sample without replacement.
    label : str
        Name used in the DEBUG messages (``'training'`` or ``'test'``).

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Embedded context of shape ``(sample_size, CONTEXT_WINDOW, EMBED_DIM)``
        and embedded targets of shape ``(sample_size, EMBED_DIM)``,
        both float32.
    """
    # keep 'null' as string, instead of being recognized as null value
    words = pd.read_csv(csv_path, na_filter=False)
    # pick random context-target pairs for embedding
    sampled_index = np.random.choice(len(words), size=sample_size, replace=False)
    sampled = words.iloc[sampled_index, :].copy()
    context = sampled.drop(columns='target').values
    target = sampled['target'].values
    # word embedding using spacy pretrained word embedding model
    embedded_context = np.zeros((len(context), CONTEXT_WINDOW, EMBED_DIM), dtype=np.float32)
    embedded_target = np.zeros((len(target), EMBED_DIM), dtype=np.float32)
    # NOTE(review): one nlp() call per word runs the full pipeline each time;
    # nlp.pipe(...) with unneeded components disabled would be much faster —
    # kept as-is here to preserve exact behavior.
    for i in tqdm(range(len(context))):
        for j, word in enumerate(context[i]):
            embedded_context[i, j, :] = nlp(word).vector
        embedded_target[i, :] = nlp(target[i]).vector
    print()
    print(f' [DEBUG] Shape of embedded context words of {label} set: {embedded_context.shape}')
    print(f' [DEBUG] Shape of embedded target words of {label} set: {embedded_target.shape}')
    return embedded_context, embedded_target


def _save_pickle(obj, path):
    """Serialize *obj* to *path* with pickle (binary mode)."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


print('Word-embedding training data: ')
ctx_train, tgt_train = _embed_words(PROCESSED_TRAIN_PATH, TRAIN_SET_SIZE, 'training')
# store embedded data
_save_pickle(ctx_train, EMBEDDED_CONTEXT_TRAIN_PATH)
_save_pickle(tgt_train, EMBEDDED_TARGET_TRAIN_PATH)

print()
print('Word-embedding test data: ')
ctx_test, tgt_test = _embed_words(PROCESSED_TEST_PATH, TEST_SET_SIZE, 'test')
# store embedded data
_save_pickle(ctx_test, EMBEDDED_CONTEXT_TEST_PATH)
_save_pickle(tgt_test, EMBEDDED_TARGET_TEST_PATH)