1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
- import os
- import json
- import collections
- import re
- from keras.models import model_from_json
- from keras.preprocessing.sequence import pad_sequences
- from python.style_extract import tokenizer
- from tqdm import tqdm
def load_model(folder):
    """Read a saved Keras model and its lookup tables from ``folder``.

    Three files are read from ``folder``: ``model_arch.json`` (the model
    architecture), ``model_weights.h5`` (the weights) and
    ``model_params.json`` (vocabulary, label map and maximum length).

    Args:
        folder: Directory path; file names are appended with a ``/``.

    Returns:
        A ``(model, params)`` tuple. ``params`` is a dict with the keys
        ``word2ind``, ``ind2word``, ``label2ind``, ``ind2label`` and
        ``maxlen``.
    """
    with open(folder + '/model_arch.json', 'r', encoding="utf-8", errors="ignore") as arch_file:
        architecture = arch_file.read()
    model = model_from_json(architecture)
    model.load_weights(folder + '/model_weights.h5')
    # NOTE(review): private Keras hook; presumably builds the predict
    # function up front -- confirm it exists in the Keras version in use.
    model._make_predict_function()

    with open(folder + '/model_params.json', 'r', encoding="utf-8", errors="ignore") as params_file:
        stored = json.load(params_file)

    def unknown_word_index():
        # Words missing from the vocabulary are encoded as index 1.
        return 1

    # A defaultdict inserts the fallback entry on every miss, so looking up
    # an unseen word also adds it to ``word2ind``.
    word2ind = collections.defaultdict(unknown_word_index, stored["word2ind"])
    ind2word = dict((index, word) for word, index in word2ind.items())

    label2ind = stored["label2ind"]
    ind2label = dict((index, label) for label, index in label2ind.items())
    # Index 0 always decodes to 'n' (presumably the padding slot -- confirm).
    ind2label[0] = 'n'

    params = {}
    params['word2ind'] = word2ind
    params['ind2word'] = ind2word
    params['label2ind'] = label2ind
    params['ind2label'] = ind2label
    params['maxlen'] = stored["max_length"]
    return model, params
def predict_on_token_array(X, model, params):
    """Predict one label per token for a single tokenized line.

    Args:
        X: Sequence of token strings.
        model: Object whose ``predict`` result supports ``argmax(2)``
            (assumed to be a Keras sequence-labelling model -- confirm).
        params: Dict providing ``word2ind`` (token -> index), ``maxlen``
            (padded length) and ``ind2label`` (index -> label).

    Returns:
        A list of labels aligned with the tail of ``X``; an empty list when
        ``X`` is empty.
    """
    if len(X) == 0:
        # ``-len(X)`` is ``-0`` here, so the tail slice below would keep the
        # entire padded row and report ``maxlen`` labels for zero tokens.
        return []

    encoded = pad_sequences(
        [[params['word2ind'][token] for token in X]],
        maxlen=params['maxlen'],
    )
    predicted = model.predict(encoded).argmax(2)
    # Only the last ``len(X)`` positions belong to real tokens; this assumes
    # pad_sequences pads on the left (its default) -- TODO confirm.
    # NOTE(review): if ``X`` is longer than ``maxlen`` the row is presumably
    # truncated, so fewer labels than tokens come back -- verify.
    tail = list(predicted)[0][-len(X):]
    return [params["ind2label"][y] for y in tail]
def predict_on_test_file(filename, model, params):
    """Label every line of a text file.

    Each line is split on whitespace, passed to ``predict_on_token_array``
    and the resulting labels are joined with single spaces.

    Args:
        filename: Path of the file, read as UTF-8 with undecodable bytes
            ignored.
        model: Model forwarded to ``predict_on_token_array``.
        params: Parameter dict forwarded to ``predict_on_token_array``.

    Returns:
        A list with one space-separated label string per line of the file.
    """
    with open(filename, 'r', encoding='utf-8', errors='ignore') as handle:
        return [
            ' '.join(predict_on_token_array(row.split(), model, params))
            for row in handle.readlines()
        ]
def predict_on_test_dir(dirname, model, params):
    """Label every ``.txt`` file found directly inside ``dirname``.

    Args:
        dirname: Directory to scan; entries not ending in ``.txt`` are
            skipped.
        model: Model forwarded to ``predict_on_test_file``.
        params: Parameter dict forwarded to ``predict_on_test_file``.

    Returns:
        A dict mapping each file name, with its trailing ``.txt`` removed,
        to the list returned by ``predict_on_test_file`` for that file.
    """
    suffix = '.txt'
    ret = {}
    for fname in tqdm(os.listdir(dirname)):
        if not fname.endswith(suffix):
            continue
        # Cut off only the trailing extension: str.replace would also delete
        # '.txt' occurring earlier in the name (e.g. 'a.txt.bak.txt' would
        # become 'a.bak' and could collide with another key).
        key = fname[:-len(suffix)]
        ret[key] = predict_on_test_file(os.path.join(dirname, fname), model, params)
    return ret
def autotag(text, model, params):
    """Tag ``text`` line by line and return the result as formatted HTML.

    Args:
        text: Input string; it is split on ``'\\n'`` and each line is
            stripped and tokenized with ``tokenizer``.
        model: Object whose ``predict`` result supports ``argmax(2)``.
        params: Dict providing ``word2ind``, ``maxlen`` and ``ind2label``.

    Returns:
        One HTML string. Lines are joined with ``<br>``; a token whose label
        is not ``'n'`` is wrapped as ``<label>token</label>``, and adjacent
        tokens sharing a label are merged into one element.
    """
    # Convert each line to token lists, encode them and predict label indices.
    X = [tokenizer(line.strip(), lower=False, enum=False, numeric=False) for line in text.split('\n')]
    X_enc = [[params['word2ind'][tokenizer(c, split=False, enum=True, numeric=True)] for c in x] for x in X]
    X_enc = pad_sequences(X_enc, maxlen=params['maxlen'])
    y_enc = model.predict(X_enc).argmax(2)

    # Turn the prediction into HTML.
    # NOTE(review): tokens are inserted into the markup without HTML
    # escaping; if ``text`` can come from an untrusted source it should be
    # escaped here -- confirm with the callers.
    lines = []
    for words, labels in zip(X, y_enc):
        rendered = []
        # Walk both sequences from the end so tokens line up with the tail of
        # the padded prediction row; zip stops at the shorter of the two.
        for word, label in zip(reversed(words), reversed(labels)):
            tag = params['ind2label'][label]
            rendered.append("<{t}>{w}</{t}>".format(t=tag, w=word) if tag != 'n' else word)
        # Appending and reversing once avoids the quadratic insert(0, ...).
        rendered.reverse()
        lines.append(rendered)
    html = "<br>".join(' '.join(line) for line in lines)

    # Rejoin neighbouring tokens with the same tag to get a cleaner view.
    for tag in params['ind2label'].values():
        html = html.replace("</{t}> <{t}>".format(t=tag), " ")
    # Raw strings keep the regex escapes from being parsed as (invalid)
    # string escapes, which newer Python versions warn about.
    html = re.sub(r'" (\w+) "', r'"\1"', html)
    html = re.sub(r' ([\.,:;]) ', r'\1 ', html)
    return html
if __name__ == "__main__":
    # Manual smoke run: load the model stored in ../model, label a sample
    # sentence and print the (token, label) pairs.
    model, params = load_model("../model")
    tokens = "governing law . the parties shall obide to".split()
    labels = predict_on_token_array(tokens, model, params)
    print([(token, label) for token, label in zip(tokens, labels)])
|