Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

test_noising.py 19 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
  1. # Copyright (c) 2017-present, Facebook, Inc.
  2. # All rights reserved.
  3. #
  4. # This source code is licensed under the license found in the LICENSE file in
  5. # the root directory of this source tree. An additional grant of patent rights
  6. # can be found in the PATENTS file in the same directory.
  7. import unittest
  8. from typing import Dict, List
  9. import tests.utils as test_utils
  10. import torch
  11. from fairseq import utils
  12. from fairseq.data import (
  13. Dictionary,
  14. LanguagePairDataset,
  15. TransformEosDataset,
  16. data_utils,
  17. noising,
  18. )
  19. class TestDataNoising(unittest.TestCase):
  20. def _get_test_data_with_bpe_cont_marker(self, append_eos=True):
  21. """
  22. Args:
  23. append_eos: if True, each input sentence in the source tokens tensor
  24. will have an EOS appended to the end.
  25. Returns:
  26. vocabs: BPE vocab with continuation markers as suffixes to denote
  27. non-end of word tokens. This is the standard BPE format used in
  28. fairseq's preprocessing.
  29. x: input tensor containing numberized source tokens, with EOS at the
  30. end if append_eos is true
  31. src_lengths: and source lengths.
  32. """
  33. vocab = Dictionary()
  34. vocab.add_symbol("he@@")
  35. vocab.add_symbol("llo")
  36. vocab.add_symbol("how")
  37. vocab.add_symbol("are")
  38. vocab.add_symbol("y@@")
  39. vocab.add_symbol("ou")
  40. vocab.add_symbol("n@@")
  41. vocab.add_symbol("ew")
  42. vocab.add_symbol("or@@")
  43. vocab.add_symbol("k")
  44. src_tokens = [
  45. ["he@@", "llo", "n@@", "ew", "y@@", "or@@", "k"],
  46. ["how", "are", "y@@", "ou"],
  47. ]
  48. x, src_lengths = x, src_lengths = self._convert_src_tokens_to_tensor(
  49. vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
  50. )
  51. return vocab, x, src_lengths
  52. def _get_test_data_with_bpe_end_marker(self, append_eos=True):
  53. """
  54. Args:
  55. append_eos: if True, each input sentence in the source tokens tensor
  56. will have an EOS appended to the end.
  57. Returns:
  58. vocabs: BPE vocab with end-of-word markers as suffixes to denote
  59. tokens at the end of a word. This is an alternative to fairseq's
  60. standard preprocessing framework and is not generally supported
  61. within fairseq.
  62. x: input tensor containing numberized source tokens, with EOS at the
  63. end if append_eos is true
  64. src_lengths: and source lengths.
  65. """
  66. vocab = Dictionary()
  67. vocab.add_symbol("he")
  68. vocab.add_symbol("llo_EOW")
  69. vocab.add_symbol("how_EOW")
  70. vocab.add_symbol("are_EOW")
  71. vocab.add_symbol("y")
  72. vocab.add_symbol("ou_EOW")
  73. vocab.add_symbol("n")
  74. vocab.add_symbol("ew_EOW")
  75. vocab.add_symbol("or")
  76. vocab.add_symbol("k_EOW")
  77. src_tokens = [
  78. ["he", "llo_EOW", "n", "ew_EOW", "y", "or", "k_EOW"],
  79. ["how_EOW", "are_EOW", "y", "ou_EOW"],
  80. ]
  81. x, src_lengths = x, src_lengths = self._convert_src_tokens_to_tensor(
  82. vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
  83. )
  84. return vocab, x, src_lengths
  85. def _get_test_data_with_word_vocab(self, append_eos=True):
  86. """
  87. Args:
  88. append_eos: if True, each input sentence in the source tokens tensor
  89. will have an EOS appended to the end.
  90. Returns:
  91. vocabs: word vocab
  92. x: input tensor containing numberized source tokens, with EOS at the
  93. end if append_eos is true
  94. src_lengths: and source lengths.
  95. """
  96. vocab = Dictionary()
  97. vocab.add_symbol("hello")
  98. vocab.add_symbol("how")
  99. vocab.add_symbol("are")
  100. vocab.add_symbol("you")
  101. vocab.add_symbol("new")
  102. vocab.add_symbol("york")
  103. src_tokens = [
  104. ["hello", "new", "york", "you"],
  105. ["how", "are", "you", "new", "york"],
  106. ]
  107. x, src_lengths = self._convert_src_tokens_to_tensor(
  108. vocab=vocab, src_tokens=src_tokens, append_eos=append_eos
  109. )
  110. return vocab, x, src_lengths
  111. def _convert_src_tokens_to_tensor(
  112. self, vocab: Dictionary, src_tokens: List[List[str]], append_eos: bool
  113. ):
  114. src_len = [len(x) for x in src_tokens]
  115. # If we have to append EOS, we include EOS in counting src length
  116. if append_eos:
  117. src_len = [length + 1 for length in src_len]
  118. x = torch.LongTensor(len(src_tokens), max(src_len)).fill_(vocab.pad())
  119. for i in range(len(src_tokens)):
  120. for j in range(len(src_tokens[i])):
  121. x[i][j] = vocab.index(src_tokens[i][j])
  122. if append_eos:
  123. x[i][j + 1] = vocab.eos()
  124. x = x.transpose(1, 0)
  125. return x, torch.LongTensor(src_len)
  126. def assert_eos_at_end(self, x, x_len, eos):
  127. """Asserts last token of every sentence in x is EOS """
  128. for i in range(len(x_len)):
  129. self.assertEqual(
  130. x[x_len[i] - 1][i],
  131. eos,
  132. (
  133. "Expected eos (token id {eos}) at the end of sentence {i} "
  134. "but got {other} instead"
  135. ).format(i=i, eos=eos, other=x[i][-1]),
  136. )
  137. def assert_word_dropout_correct(self, x, x_noised, x_len, l_noised):
  138. # Expect only the first word (2 bpe tokens) of the first example
  139. # was dropped out
  140. self.assertEqual(x_len[0] - 2, l_noised[0])
  141. for i in range(l_noised[0]):
  142. self.assertEqual(x_noised[i][0], x[i + 2][0])
  143. def test_word_dropout_with_eos(self):
  144. vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)
  145. with data_utils.numpy_seed(1234):
  146. noising_gen = noising.WordDropout(vocab)
  147. x_noised, l_noised = noising_gen.noising(x, x_len, 0.2)
  148. self.assert_word_dropout_correct(
  149. x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
  150. )
  151. self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
  152. def assert_word_blanking_correct(self, x, x_noised, x_len, l_noised, unk):
  153. # Expect only the first word (2 bpe tokens) of the first example
  154. # was blanked out
  155. self.assertEqual(x_len[0], l_noised[0])
  156. for i in range(l_noised[0]):
  157. if i < 2:
  158. self.assertEqual(x_noised[i][0], unk)
  159. else:
  160. self.assertEqual(x_noised[i][0], x[i][0])
  161. def test_word_blank_with_eos(self):
  162. vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)
  163. with data_utils.numpy_seed(1234):
  164. noising_gen = noising.WordDropout(vocab)
  165. x_noised, l_noised = noising_gen.noising(x, x_len, 0.2, vocab.unk())
  166. self.assert_word_blanking_correct(
  167. x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk()
  168. )
  169. self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
  170. def generate_unchanged_shuffle_map(self, length):
  171. return {i: i for i in range(length)}
  172. def assert_word_shuffle_matches_expected(
  173. self,
  174. x,
  175. x_len,
  176. max_shuffle_distance: int,
  177. vocab: Dictionary,
  178. expected_shufle_maps: List[Dict[int, int]],
  179. expect_eos_at_end: bool,
  180. bpe_end_marker=None,
  181. ):
  182. """
  183. This verifies that with a given x, x_len, max_shuffle_distance, and
  184. vocab, we get the expected shuffle result.
  185. Args:
  186. x: Tensor of shape (T x B) = (sequence_length, batch_size)
  187. x_len: Tensor of length B = batch_size
  188. max_shuffle_distance: arg to pass to noising
  189. expected_shuffle_maps: List[mapping] where mapping is a
  190. Dict[old_index, new_index], mapping x's elements from their
  191. old positions in x to their new positions in x.
  192. expect_eos_at_end: if True, check the output to make sure there is
  193. an EOS at the end.
  194. bpe_end_marker: str denoting the BPE end token. If this is not None, we
  195. set the BPE cont token to None in the noising classes.
  196. """
  197. bpe_cont_marker = None
  198. if bpe_end_marker is None:
  199. bpe_cont_marker = "@@"
  200. with data_utils.numpy_seed(1234):
  201. word_shuffle = noising.WordShuffle(
  202. vocab, bpe_cont_marker=bpe_cont_marker, bpe_end_marker=bpe_end_marker
  203. )
  204. x_noised, l_noised = word_shuffle.noising(
  205. x, x_len, max_shuffle_distance=max_shuffle_distance
  206. )
  207. # For every example, we have a different expected shuffle map. We check
  208. # that each example is shuffled as expected according to each
  209. # corresponding shuffle map.
  210. for i in range(len(expected_shufle_maps)):
  211. shuffle_map = expected_shufle_maps[i]
  212. for k, v in shuffle_map.items():
  213. self.assertEqual(x[k][i], x_noised[v][i])
  214. # Shuffling should not affect the length of each example
  215. for pre_shuffle_length, post_shuffle_length in zip(x_len, l_noised):
  216. self.assertEqual(pre_shuffle_length, post_shuffle_length)
  217. if expect_eos_at_end:
  218. self.assert_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
  219. def test_word_shuffle_with_eos(self):
  220. vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=True)
  221. # Assert word shuffle with max shuffle distance 0 causes input to be
  222. # unchanged
  223. self.assert_word_shuffle_matches_expected(
  224. x=x,
  225. x_len=x_len,
  226. max_shuffle_distance=0,
  227. vocab=vocab,
  228. expected_shufle_maps=[
  229. self.generate_unchanged_shuffle_map(example_len)
  230. for example_len in x_len
  231. ],
  232. expect_eos_at_end=True,
  233. )
  234. # Assert word shuffle with max shuffle distance 3 matches our expected
  235. # shuffle order
  236. self.assert_word_shuffle_matches_expected(
  237. x=x,
  238. x_len=x_len,
  239. vocab=vocab,
  240. max_shuffle_distance=3,
  241. expected_shufle_maps=[
  242. self.generate_unchanged_shuffle_map(x_len[0]),
  243. {0: 0, 1: 3, 2: 1, 3: 2},
  244. ],
  245. expect_eos_at_end=True,
  246. )
  247. def test_word_shuffle_with_eos_nonbpe(self):
  248. """The purpose of this is to test shuffling logic with word vocabs"""
  249. vocab, x, x_len = self._get_test_data_with_word_vocab(append_eos=True)
  250. # Assert word shuffle with max shuffle distance 0 causes input to be
  251. # unchanged
  252. self.assert_word_shuffle_matches_expected(
  253. x=x,
  254. x_len=x_len,
  255. max_shuffle_distance=0,
  256. vocab=vocab,
  257. expected_shufle_maps=[
  258. self.generate_unchanged_shuffle_map(example_len)
  259. for example_len in x_len
  260. ],
  261. expect_eos_at_end=True,
  262. )
  263. # Assert word shuffle with max shuffle distance 3 matches our expected
  264. # shuffle order
  265. self.assert_word_shuffle_matches_expected(
  266. x=x,
  267. x_len=x_len,
  268. vocab=vocab,
  269. max_shuffle_distance=3,
  270. expected_shufle_maps=[
  271. {0: 0, 1: 1, 2: 3, 3: 2},
  272. {0: 0, 1: 2, 2: 1, 3: 3, 4: 4},
  273. ],
  274. expect_eos_at_end=True,
  275. )
  276. def test_word_shuffle_without_eos(self):
  277. """Same result as word shuffle with eos except no EOS at end"""
  278. vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)
  279. # Assert word shuffle with max shuffle distance 0 causes input to be
  280. # unchanged
  281. self.assert_word_shuffle_matches_expected(
  282. x=x,
  283. x_len=x_len,
  284. max_shuffle_distance=0,
  285. vocab=vocab,
  286. expected_shufle_maps=[
  287. self.generate_unchanged_shuffle_map(example_len)
  288. for example_len in x_len
  289. ],
  290. expect_eos_at_end=False,
  291. )
  292. # Assert word shuffle with max shuffle distance 3 matches our expected
  293. # shuffle order
  294. self.assert_word_shuffle_matches_expected(
  295. x=x,
  296. x_len=x_len,
  297. vocab=vocab,
  298. max_shuffle_distance=3,
  299. expected_shufle_maps=[
  300. self.generate_unchanged_shuffle_map(x_len[0]),
  301. {0: 0, 1: 3, 2: 1, 3: 2},
  302. ],
  303. expect_eos_at_end=False,
  304. )
  305. def test_word_shuffle_without_eos_with_bpe_end_marker(self):
  306. """Same result as word shuffle without eos except using BPE end token"""
  307. vocab, x, x_len = self._get_test_data_with_bpe_end_marker(append_eos=False)
  308. # Assert word shuffle with max shuffle distance 0 causes input to be
  309. # unchanged
  310. self.assert_word_shuffle_matches_expected(
  311. x=x,
  312. x_len=x_len,
  313. max_shuffle_distance=0,
  314. vocab=vocab,
  315. expected_shufle_maps=[
  316. self.generate_unchanged_shuffle_map(example_len)
  317. for example_len in x_len
  318. ],
  319. expect_eos_at_end=False,
  320. bpe_end_marker="_EOW",
  321. )
  322. # Assert word shuffle with max shuffle distance 3 matches our expected
  323. # shuffle order
  324. self.assert_word_shuffle_matches_expected(
  325. x=x,
  326. x_len=x_len,
  327. vocab=vocab,
  328. max_shuffle_distance=3,
  329. expected_shufle_maps=[
  330. self.generate_unchanged_shuffle_map(x_len[0]),
  331. {0: 0, 1: 3, 2: 1, 3: 2},
  332. ],
  333. expect_eos_at_end=False,
  334. bpe_end_marker="_EOW",
  335. )
  336. def assert_no_eos_at_end(self, x, x_len, eos):
  337. """Asserts that the last token of each sentence in x is not EOS """
  338. for i in range(len(x_len)):
  339. self.assertNotEqual(
  340. x[x_len[i] - 1][i],
  341. eos,
  342. "Expected no eos (token id {eos}) at the end of sentence {i}.".format(
  343. eos=eos, i=i
  344. ),
  345. )
  346. def test_word_dropout_without_eos(self):
  347. """Same result as word dropout with eos except no EOS at end"""
  348. vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)
  349. with data_utils.numpy_seed(1234):
  350. noising_gen = noising.WordDropout(vocab)
  351. x_noised, l_noised = noising_gen.noising(x, x_len, 0.2)
  352. self.assert_word_dropout_correct(
  353. x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised
  354. )
  355. self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
  356. def test_word_blank_without_eos(self):
  357. """Same result as word blank with eos except no EOS at end"""
  358. vocab, x, x_len = self._get_test_data_with_bpe_cont_marker(append_eos=False)
  359. with data_utils.numpy_seed(1234):
  360. noising_gen = noising.WordDropout(vocab)
  361. x_noised, l_noised = noising_gen.noising(x, x_len, 0.2, vocab.unk())
  362. self.assert_word_blanking_correct(
  363. x=x, x_noised=x_noised, x_len=x_len, l_noised=l_noised, unk=vocab.unk()
  364. )
  365. self.assert_no_eos_at_end(x=x_noised, x_len=l_noised, eos=vocab.eos())
  366. def _get_noising_dataset_batch(
  367. self, src_tokens_no_pad, src_dict, append_eos_to_tgt=False,
  368. ):
  369. """
  370. Constructs a NoisingDataset and the corresponding
  371. ``LanguagePairDataset(NoisingDataset(src), src)``. If
  372. *append_eos_to_tgt* is True, wrap the source dataset in
  373. :class:`TransformEosDataset` to append EOS to the clean source when
  374. using it as the target.
  375. """
  376. src_dataset = test_utils.TestDataset(data=src_tokens_no_pad)
  377. noising_dataset = noising.NoisingDataset(
  378. src_dataset=src_dataset,
  379. src_dict=src_dict,
  380. seed=1234,
  381. max_word_shuffle_distance=3,
  382. word_dropout_prob=0.2,
  383. word_blanking_prob=0.2,
  384. noising_class=noising.UnsupervisedMTNoising,
  385. )
  386. tgt = src_dataset
  387. language_pair_dataset = LanguagePairDataset(
  388. src=noising_dataset, tgt=tgt, src_sizes=None, src_dict=src_dict
  389. )
  390. language_pair_dataset = TransformEosDataset(
  391. language_pair_dataset, src_dict.eos(),
  392. append_eos_to_tgt=append_eos_to_tgt,
  393. )
  394. dataloader = torch.utils.data.DataLoader(
  395. dataset=language_pair_dataset,
  396. batch_size=2,
  397. collate_fn=language_pair_dataset.collater,
  398. )
  399. denoising_batch_result = next(iter(dataloader))
  400. return denoising_batch_result
  401. def test_noising_dataset_with_eos(self):
  402. src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
  403. append_eos=True
  404. )
  405. # Format data for src_dataset
  406. src_tokens = torch.t(src_tokens)
  407. src_tokens_no_pad = []
  408. for src_sentence in src_tokens:
  409. src_tokens_no_pad.append(
  410. utils.strip_pad(tensor=src_sentence, pad=src_dict.pad())
  411. )
  412. denoising_batch_result = self._get_noising_dataset_batch(
  413. src_tokens_no_pad=src_tokens_no_pad, src_dict=src_dict
  414. )
  415. eos, pad = src_dict.eos(), src_dict.pad()
  416. # Generated noisy source as source
  417. expected_src = torch.LongTensor(
  418. [[4, 5, 10, 11, 8, 12, 13, eos], [pad, pad, pad, 6, 8, 9, 7, eos]]
  419. )
  420. # Original clean source as target (right-padded)
  421. expected_tgt = torch.LongTensor(
  422. [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]]
  423. )
  424. generated_src = denoising_batch_result["net_input"]["src_tokens"]
  425. tgt_tokens = denoising_batch_result["target"]
  426. self.assertTensorEqual(expected_src, generated_src)
  427. self.assertTensorEqual(expected_tgt, tgt_tokens)
  428. def test_noising_dataset_without_eos(self):
  429. """
  430. Similar to test noising dataset with eos except that we have to set
  431. *append_eos_to_tgt* to ``True``.
  432. """
  433. src_dict, src_tokens, _ = self._get_test_data_with_bpe_cont_marker(
  434. append_eos=False
  435. )
  436. # Format data for src_dataset
  437. src_tokens = torch.t(src_tokens)
  438. src_tokens_no_pad = []
  439. for src_sentence in src_tokens:
  440. src_tokens_no_pad.append(
  441. utils.strip_pad(tensor=src_sentence, pad=src_dict.pad())
  442. )
  443. denoising_batch_result = self._get_noising_dataset_batch(
  444. src_tokens_no_pad=src_tokens_no_pad,
  445. src_dict=src_dict,
  446. append_eos_to_tgt=True,
  447. )
  448. eos, pad = src_dict.eos(), src_dict.pad()
  449. # Generated noisy source as source
  450. expected_src = torch.LongTensor(
  451. [[4, 5, 10, 11, 8, 12, 13], [pad, pad, pad, 6, 8, 9, 7]]
  452. )
  453. # Original clean source as target (right-padded)
  454. expected_tgt = torch.LongTensor(
  455. [[4, 5, 10, 11, 8, 12, 13, eos], [6, 7, 8, 9, eos, pad, pad, pad]]
  456. )
  457. generated_src = denoising_batch_result["net_input"]["src_tokens"]
  458. tgt_tokens = denoising_batch_result["target"]
  459. self.assertTensorEqual(expected_src, generated_src)
  460. self.assertTensorEqual(expected_tgt, tgt_tokens)
  461. def assertTensorEqual(self, t1, t2):
  462. self.assertEqual(t1.size(), t2.size(), "size mismatch")
  463. self.assertEqual(t1.ne(t2).long().sum(), 0)
  464. if __name__ == "__main__":
  465. unittest.main()
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...