1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pickle

import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm

from config import *

nlp = spacy.load('en_core_web_sm')
# en_core_web_sm ships no static word vectors; `Token.vector` falls back to the
# tok2vec tensor (width 96 for this model). Derive the width at runtime instead
# of hardcoding 96 in four places, so swapping the model keeps the script valid.
EMBED_DIM = nlp('probe').vector.shape[0]


def _embed_words(csv_path, sample_size, label):
    """Sample rows from a processed CSV and embed context/target words.

    Parameters
    ----------
    csv_path : str
        Path to the processed CSV; one ``target`` column, the remaining
        columns form the context window (CONTEXT_WINDOW words, per config).
    sample_size : int
        Number of rows to sample without replacement.
    label : str
        Name used in the DEBUG messages (``'training'`` or ``'test'``).

    Returns
    -------
    tuple[np.ndarray, np.ndarray]
        Embedded context of shape ``(sample_size, CONTEXT_WINDOW, EMBED_DIM)``
        and embedded targets of shape ``(sample_size, EMBED_DIM)``,
        both float32.
    """
    # keep 'null' as string, instead of being recognized as null value
    words = pd.read_csv(csv_path, na_filter=False)
    # pick random context-target pairs for embedding
    sampled_index = np.random.choice(len(words), size=sample_size, replace=False)
    sampled = words.iloc[sampled_index, :].copy()
    context = sampled.drop(columns='target').values
    target = sampled['target'].values
    # word embedding using spacy pretrained word embedding model
    embedded_context = np.zeros((len(context), CONTEXT_WINDOW, EMBED_DIM), dtype=np.float32)
    embedded_target = np.zeros((len(target), EMBED_DIM), dtype=np.float32)
    # NOTE(review): one nlp() call per word runs the full pipeline each time;
    # nlp.pipe(...) with unneeded components disabled would be much faster —
    # kept as-is here to preserve exact behavior.
    for i in tqdm(range(len(context))):
        for j, word in enumerate(context[i]):
            embedded_context[i, j, :] = nlp(word).vector
        embedded_target[i, :] = nlp(target[i]).vector
    print()
    print(f' [DEBUG] Shape of embedded context words of {label} set: {embedded_context.shape}')
    print(f' [DEBUG] Shape of embedded target words of {label} set: {embedded_target.shape}')
    return embedded_context, embedded_target


def _save_pickle(obj, path):
    """Serialize *obj* to *path* with pickle (binary mode)."""
    with open(path, 'wb') as f:
        pickle.dump(obj, f)


print('Word-embedding training data: ')
ctx_train, tgt_train = _embed_words(PROCESSED_TRAIN_PATH, TRAIN_SET_SIZE, 'training')
# store embedded data
_save_pickle(ctx_train, EMBEDDED_CONTEXT_TRAIN_PATH)
_save_pickle(tgt_train, EMBEDDED_TARGET_TRAIN_PATH)

print()
print('Word-embedding test data: ')
ctx_test, tgt_test = _embed_words(PROCESSED_TEST_PATH, TEST_SET_SIZE, 'test')
# store embedded data
_save_pickle(ctx_test, EMBEDDED_CONTEXT_TEST_PATH)
_save_pickle(tgt_test, EMBEDDED_TARGET_TEST_PATH)