text_preprocessing.py

import os

import numpy as np
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

from config import *


def tokenize(file_path):
    """
    Tokenize the text of the file referred to by file_path.
    """
    with open(file_path, encoding='latin-1', mode='r') as f:
        text = f.read()
    text = text.lower()
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    tokens = tokenizer.tokenize(text)
    tokens = [w for w in tokens if len(w) > 2]  # ignore a, an, to, at, be, ...
    tokens = [w for w in tokens if w not in ENGLISH_STOP_WORDS]
    return tokens


print('[DEBUG] Started text preprocessing')
print('--------------------------------')

# load text from training data, excluding hidden files and directories
train_file_list = []
for root, dirs, files in os.walk(TRAIN_ROOT_PATH):
    files = [f for f in files if not f.startswith('.')]
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    for name in files:
        train_file_list.append(os.path.join(root, name))
print('[DEBUG] Obtained text files for training')

# feature engineering: slide a window of CONTEXT_WINDOW tokens over each file;
# the window is the context, the token that follows it is the prediction target
context_train = []
target_train = []
for file_path in train_file_list:
    tokens = tokenize(file_path)
    for i in range(len(tokens) - CONTEXT_WINDOW):
        context_train.append(tokens[i:i + CONTEXT_WINDOW])
        target_train.append(tokens[i + CONTEXT_WINDOW])

# create features and target for model training
df_train = pd.DataFrame(context_train, columns=[f'context_{i}' for i in range(CONTEXT_WINDOW)])
df_train['target'] = np.array(target_train)
print('[DEBUG] Done context-target split in text for training')

# load text from test data, excluding hidden files and directories
test_file_list = []
for root, dirs, files in os.walk(TEST_ROOT_PATH):
    files = [f for f in files if not f.startswith('.')]
    dirs[:] = [d for d in dirs if not d.startswith('.')]
    for name in files:
        test_file_list.append(os.path.join(root, name))
print('[DEBUG] Obtained text files for testing')

# feature engineering: same sliding-window split as for the training data
context_test = []
target_test = []
for file_path in test_file_list:
    tokens = tokenize(file_path)
    for i in range(len(tokens) - CONTEXT_WINDOW):
        context_test.append(tokens[i:i + CONTEXT_WINDOW])
        target_test.append(tokens[i + CONTEXT_WINDOW])

df_test = pd.DataFrame(context_test, columns=[f'context_{i}' for i in range(CONTEXT_WINDOW)])
df_test['target'] = np.array(target_test)
print('[DEBUG] Done context-target split in text for testing')

# store training and test tabular data in csv files
df_train.to_csv(PROCESSED_TRAIN_PATH, index=False)
df_test.to_csv(PROCESSED_TEST_PATH, index=False)
print('[DEBUG] Stored training and test data of context vs target words in csv files')
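The script star-imports its paths and window size from config. A minimal sketch of what that module could look like; the constant names are the ones the script uses, but the values here are placeholders, not the original settings:

# config.py -- hypothetical values; only the names are dictated by the script
TRAIN_ROOT_PATH = 'data/raw/train'       # directory tree walked for training text files
TEST_ROOT_PATH = 'data/raw/test'         # directory tree walked for test text files
CONTEXT_WINDOW = 5                       # number of context tokens preceding each target
PROCESSED_TRAIN_PATH = 'data/processed/train.csv'
PROCESSED_TEST_PATH = 'data/processed/test.csv'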
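The effect of tokenize is easiest to see inline. This sketch applies the same regex and stop-word filtering to a made-up sentence instead of a file:

# demo of the filtering steps inside tokenize(); the sample sentence is invented
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

text = 'The model is trained to predict the next word from its context.'.lower()
tokens = RegexpTokenizer(r'[a-zA-Z]+').tokenize(text)  # letters-only tokens
tokens = [w for w in tokens if len(w) > 2]             # drop very short words
tokens = [w for w in tokens if w not in ENGLISH_STOP_WORDS]
print(tokens)  # expected: ['model', 'trained', 'predict', 'word', 'context']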

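As a quick sanity check of the stored output, the processed table can be read back with pandas. This snippet is not part of the original pipeline and only assumes the config names above:

# check_output.py -- hypothetical sanity check, not part of the original script
import pandas as pd
from config import PROCESSED_TRAIN_PATH, CONTEXT_WINDOW

df = pd.read_csv(PROCESSED_TRAIN_PATH)
# expect CONTEXT_WINDOW context columns followed by one target column
assert list(df.columns) == [f'context_{i}' for i in range(CONTEXT_WINDOW)] + ['target']
print(df.head())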