Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

文本分类_lstm_subword.py 4.1 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
  1. # -*- coding: utf-8 -*-
  2. """文本分类-LSTM-subword.ipynb
  3. Automatically generated by Colaboratory.
  4. Original file is located at
  5. https://colab.research.google.com/drive/1yAuE8tx8tAEJ8auF6K5v95AnoDWvnApY
  6. """
  7. import matplotlib as mpl
  8. import matplotlib.pyplot as plt
  9. import numpy as np
  10. import pandas as pd
  11. import tensorflow as tf
  12. from tensorflow import keras
  13. import sklearn
  14. import os
  15. import sys
  16. import time
  17. # 加载数据集特别常用
  18. import tensorflow_datasets as tfds
  19. print(tf.__version__)
  20. print(sys.version_info)
  21. for module in mpl,np,pd,sklearn,tf,keras,tfds:
  22. print(module.__name__,module.__version__)
  23. """https://tensorflow.google.cn/datasets/catalog/overview
  24. 好多数据集:音频、图片、问答、文本、翻译、视频、可视化
  25. """
  26. # 影评分类
  27. # 下载subword数据集
  28. # with_info:返回元组(tf.data.Dataset,tfds.core.DatasetInfo)
  29. # as_supervised True:有监督的,会把labels返回 False:无监督的,不会把labels返回
  30. # info:subword形成的集合
  31. dataset,info=tfds.load('imdb_reviews/subwords8k',with_info=True,as_supervised=True)
  32. train_dataset,test_dataset=dataset['train'],dataset['test']
  33. # 看看输入、输出
  34. print(train_dataset)
  35. print(test_dataset)
  36. """输入是(None,)
  37. 输出是()
  38. """
  39. train_dataset = train_dataset.map(lambda x_text, x_label: (x_text, tf.expand_dims(x_label, -1)))
  40. test_dataset = test_dataset.map(lambda x_text, x_label: (x_text, tf.expand_dims(x_label, -1)))
  41. # encoder:把文本转成subword形式
  42. # tokenizer对象
  43. tokenizer=info.features['text'].encoder
  44. print(type(tokenizer))
  45. # 看看词袋里面有哪些单词
  46. print('vocabulary size:{}'.format(tokenizer.vocab_size))
  47. # 从训练集中拿出一个,看看有哪些词根subword
  48. for i in train_dataset.take(1):
  49. print(i)
  50. # 对于随便一个句子,看看它在词袋中的id
  51. sample_string="Tensorflow is cool."
  52. # encode():把文本变为subword的id序列
  53. tokenized_string=tokenizer.encode(sample_string)
  54. print("Tokenized string is {}".format(tokenized_string))
  55. # decode():把subword的id序列变为文本
  56. original_string=tokenizer.decode(tokenized_string)
  57. print("Original string is {}".format(original_string))
  58. assert original_string==sample_string
  59. # 看看这个例子中的每个subword的id
  60. for token in tokenized_string:
  61. print("{}—>{} len:{}".format(token,tokenizer.decode([token]),len(tokenizer.decode([token]))))
  62. """空格也有id"""
  63. # 获取shape
  64. buffer_size=10000
  65. batch_size=64
  66. padded_shapes=tf.compat.v1.data.get_output_shapes(train_dataset)
  67. print(padded_shapes)
  68. padded_shapes_test=tf.compat.v1.data.get_output_shapes(test_dataset)
  69. print(padded_shapes_test)
  70. train_dataset=train_dataset.shuffle(buffer_size)
  71. print(train_dataset)
  72. # padded_batch()对每批数据做padding
  73. train_dataset_=train_dataset.padded_batch(batch_size,padded_shapes)
  74. test_dataset=test_dataset.padded_batch(batch_size,padded_shapes_test)
  75. print(train_dataset)
  76. print(test_dataset)
  77. """batch之后维度增加了
  78. """
  79. vocab_size=tokenizer.vocab_size
  80. embedding_dim=16
  81. batch_size=512
  82. # 双向单层LSTM
  83. bi_lstm_model=keras.models.Sequential([
  84. keras.layers.Embedding(vocab_size,embedding_dim),
  85. keras.layers.Bidirectional(keras.layers.LSTM(units=32,return_sequences=False)),
  86. keras.layers.Dense(32,activation='relu'),
  87. keras.layers.Dense(1,activation='sigmoid')
  88. ])
  89. bi_lstm_model.summary()
  90. bi_lstm_model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])
  91. """subword 词袋大小:8185
  92. 8185x16=130960
  93. """
  94. history=bi_lstm_model.fit(train_dataset,epochs=10,validation_data=test_dataset)
  95. def plot_learning_curves(history,label,epochs,min_value,max_value):
  96. data={}
  97. data[label]=history.history[label]
  98. data['val_'+label]=history.history['val_'+label]
  99. pd.DataFrame(data).plot(figsize=(8,5))
  100. plt.grid(False)
  101. plt.axis([0,epochs,min_value,max_value])
  102. plt.show()
  103. plot_learning_curves(history,'accuracy',10,0,1)
  104. plot_learning_curves(history,'loss',10,0,1)
  105. """在验证集上:accuracy效果好,loss也没有过拟合,subword-level效果最好啊!!!"""
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...