model.py
import numpy as np
import tensorflow as tf
from tensorflow.contrib.training import HParams

def default_hparams():
    return HParams(
        n_vocab=0,
        n_ctx=1024,
        n_embd=768,
        n_head=12,
        n_layer=12,
    )

def shape_list(x):
    """Deal with dynamic shape in tensorflow cleanly."""
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if s is None else s for i, s in enumerate(static)]

def softmax(x, axis=-1):
    x = x - tf.reduce_max(x, axis=axis, keepdims=True)
    ex = tf.exp(x)
    return ex / tf.reduce_sum(ex, axis=axis, keepdims=True)

def gelu(x):
    return 0.5*x*(1+tf.tanh(np.sqrt(2/np.pi)*(x+0.044715*tf.pow(x, 3))))
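# Sketch (not part of the original file): the expression above is the tanh
# approximation to the exact GELU, x * Phi(x). A quick numpy check of how
# closely the two agree, using scipy.special.erf for the Gaussian CDF:
#
#     import numpy as np
#     from scipy.special import erf
#     x = np.linspace(-4.0, 4.0, 801)
#     approx = 0.5*x*(1 + np.tanh(np.sqrt(2/np.pi)*(x + 0.044715*x**3)))
#     exact = 0.5*x*(1 + erf(x/np.sqrt(2)))
#     print(np.max(np.abs(approx - exact)))  # small; the curves are nearly indistinguishable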
def norm(x, scope, *, axis=-1, epsilon=1e-5):
    """Normalize to mean = 0, std = 1, then do a diagonal affine transform."""
    with tf.variable_scope(scope):
        n_state = x.shape[-1].value
        g = tf.get_variable('g', [n_state], initializer=tf.constant_initializer(1))
        b = tf.get_variable('b', [n_state], initializer=tf.constant_initializer(0))
        u = tf.reduce_mean(x, axis=axis, keepdims=True)
        s = tf.reduce_mean(tf.square(x-u), axis=axis, keepdims=True)
        x = (x - u) * tf.rsqrt(s + epsilon)
        x = x*g + b
        return x

def split_states(x, n):
    """Reshape the last dimension of x into [n, x.shape[-1]/n]."""
    *start, m = shape_list(x)
    return tf.reshape(x, start + [n, m//n])

def merge_states(x):
    """Smash the last two dimensions of x into a single dimension."""
    *start, a, b = shape_list(x)
    return tf.reshape(x, start + [a*b])

def conv1d(x, scope, nf, *, w_init_stdev=0.02):
    with tf.variable_scope(scope):
        *start, nx = shape_list(x)
        w = tf.get_variable('w', [1, nx, nf], initializer=tf.random_normal_initializer(stddev=w_init_stdev))
        b = tf.get_variable('b', [nf], initializer=tf.constant_initializer(0))
        c = tf.reshape(tf.matmul(tf.reshape(x, [-1, nx]), tf.reshape(w, [-1, nf]))+b, start+[nf])
        return c

def attention_mask(nd, ns, *, dtype):
    """1's in the lower triangle, counting from the lower right corner.

    Same as tf.matrix_band_part(tf.ones([nd, ns]), -1, ns-nd), but doesn't produce garbage on TPUs.
    """
    i = tf.range(nd)[:,None]
    j = tf.range(ns)
    m = i >= j - ns + nd
    return tf.cast(m, dtype)
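# Worked example (not part of the original file), using plain numpy instead of
# TF ops. For nd=3 new query positions attending over ns=5 key positions
# (e.g. 2 cached tokens plus the 3 new ones):
#
#     import numpy as np
#     nd, ns = 3, 5
#     i = np.arange(nd)[:, None]
#     j = np.arange(ns)
#     print((i >= j - ns + nd).astype(np.float32))
#     # [[1. 1. 1. 0. 0.]
#     #  [1. 1. 1. 1. 0.]
#     #  [1. 1. 1. 1. 1.]]
#
# Each query position may attend to every cached key and to keys up to and
# including its own position, never to future positions.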
def attn(x, scope, n_state, *, past, hparams):
    assert x.shape.ndims == 3  # Should be [batch, sequence, features]
    assert n_state % hparams.n_head == 0
    if past is not None:
        assert past.shape.ndims == 5  # Should be [batch, 2, heads, sequence, features], where 2 is [k, v]

    def split_heads(x):
        # From [batch, sequence, features] to [batch, heads, sequence, features]
        return tf.transpose(split_states(x, hparams.n_head), [0, 2, 1, 3])

    def merge_heads(x):
        # Reverse of split_heads
        return merge_states(tf.transpose(x, [0, 2, 1, 3]))

    def mask_attn_weights(w):
        # w has shape [batch, heads, dst_sequence, src_sequence], where information flows from src to dst.
        _, _, nd, ns = shape_list(w)
        b = attention_mask(nd, ns, dtype=w.dtype)
        b = tf.reshape(b, [1, 1, nd, ns])
        w = w*b - tf.cast(1e10, w.dtype)*(1-b)
        return w

    def multihead_attn(q, k, v):
        # q, k, v have shape [batch, heads, sequence, features]
        w = tf.matmul(q, k, transpose_b=True)
        w = w * tf.rsqrt(tf.cast(v.shape[-1].value, w.dtype))

        w = mask_attn_weights(w)
        w = softmax(w)
        a = tf.matmul(w, v)
        return a

    with tf.variable_scope(scope):
        c = conv1d(x, 'c_attn', n_state*3)
        q, k, v = map(split_heads, tf.split(c, 3, axis=2))
        present = tf.stack([k, v], axis=1)
        if past is not None:
            pk, pv = tf.unstack(past, axis=1)
            k = tf.concat([pk, k], axis=-2)
            v = tf.concat([pv, v], axis=-2)
        a = multihead_attn(q, k, v)
        a = merge_heads(a)
        a = conv1d(a, 'c_proj', n_state)
        return a, present

def mlp(x, scope, n_state, *, hparams):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        h = gelu(conv1d(x, 'c_fc', n_state))
        h2 = conv1d(h, 'c_proj', nx)
        return h2

def block(x, scope, *, past, hparams):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams)
        x = x + a
        m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams)
        x = x + m
        return x, present

def past_shape(*, hparams, batch_size=None, sequence=None):
    return [batch_size, hparams.n_layer, 2, hparams.n_head, sequence, hparams.n_embd // hparams.n_head]

def expand_tile(value, size):
    """Add a new axis of given size."""
    value = tf.convert_to_tensor(value, name='value')
    ndims = value.shape.ndims
    return tf.tile(tf.expand_dims(value, axis=0), [size] + [1]*ndims)

def positions_for(tokens, past_length):
    batch_size = tf.shape(tokens)[0]
    nsteps = tf.shape(tokens)[1]
    return expand_tile(past_length + tf.range(nsteps), batch_size)

def model(hparams, X, past=None, scope='model', reuse=False):
    with tf.variable_scope(scope, reuse=reuse):
        results = {}
        batch, sequence = shape_list(X)

        wpe = tf.get_variable('wpe', [hparams.n_ctx, hparams.n_embd],
                              initializer=tf.random_normal_initializer(stddev=0.01))
        wte = tf.get_variable('wte', [hparams.n_vocab, hparams.n_embd],
                              initializer=tf.random_normal_initializer(stddev=0.02))
        past_length = 0 if past is None else tf.shape(past)[-2]
        h = tf.gather(wte, X) + tf.gather(wpe, positions_for(X, past_length))

        # Transformer
        presents = []
        pasts = tf.unstack(past, axis=1) if past is not None else [None] * hparams.n_layer
        assert len(pasts) == hparams.n_layer
        for layer, past in enumerate(pasts):
            h, present = block(h, 'h%d' % layer, past=past, hparams=hparams)
            presents.append(present)
        results['present'] = tf.stack(presents, axis=1)
        h = norm(h, 'ln_f')

        # Language model loss. Do tokens <n predict token n?
        h_flat = tf.reshape(h, [batch*sequence, hparams.n_embd])
        logits = tf.matmul(h_flat, wte, transpose_b=True)
        logits = tf.reshape(logits, [batch, sequence, hparams.n_vocab])
        results['logits'] = logits
        return results
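
A minimal usage sketch, not part of model.py itself: build the forward graph and run it on a batch of token ids. It assumes TensorFlow 1.x, that the file above is importable as `model`, and GPT-2's released vocabulary size of 50257; the token ids fed in below are arbitrary placeholders rather than real BPE output, and the weights are randomly initialized rather than loaded from a checkpoint.

import tensorflow as tf
import model  # the file above

hparams = model.default_hparams()
hparams.override_from_dict({'n_vocab': 50257})  # n_vocab defaults to 0 and must be set

tokens = tf.placeholder(tf.int32, [None, None])   # [batch, sequence]
output = model.model(hparams=hparams, X=tokens)   # {'logits': ..., 'present': ...}

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())   # random weights; load a checkpoint for real use
    logits = sess.run(output['logits'],
                      feed_dict={tokens: [[31, 41, 59]]})  # arbitrary example ids
    print(logits.shape)  # (1, 3, 50257): next-token logits for every position

For incremental decoding, `output['present']` (whose shape matches `past_shape(hparams=hparams)`) can be fed back in as `past` on the next call, so keys and values for earlier positions are not recomputed.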