prepare.py

# saves the openwebtext dataset to a binary file for training. following was helpful:
# https://github.com/HazyResearch/flash-attention/blob/main/training/src/datamodules/language_modeling_hf.py
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

# number of workers in .map() call
# good number to use is ~order number of cpu cores // 2
num_proc = 8

# number of workers in load_dataset() call
# best number might be different from num_proc above as it also depends on NW speed.
# it is better than 1 usually though
num_proc_load_dataset = num_proc

enc = tiktoken.get_encoding("gpt2")

if __name__ == '__main__':
    # takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
    dataset = load_dataset("openwebtext", num_proc=num_proc_load_dataset)

    # owt by default only contains the 'train' split, so create a test split
    split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
    split_dataset['val'] = split_dataset.pop('test') # rename the test split to val

    # this results in:
    # >>> split_dataset
    # DatasetDict({
    #     train: Dataset({
    #         features: ['text'],
    #         num_rows: 8009762
    #     })
    #     val: Dataset({
    #         features: ['text'],
    #         num_rows: 4007
    #     })
    # })

    # we now want to tokenize the dataset. first define the encoding function (gpt2 bpe)
    def process(example):
        ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
        ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
        # note: I think eot should be prepended not appended... hmm. it's called "eot" though...
        out = {'ids': ids, 'len': len(ids)}
        return out

    # tokenize the dataset
    tokenized = split_dataset.map(
        process,
        remove_columns=['text'],
        desc="tokenizing the splits",
        num_proc=num_proc,
    )

    # concatenate all the ids in each dataset into one large file we can use for training
    for split, dset in tokenized.items():
        arr_len = np.sum(dset['len'], dtype=np.uint64)
        filename = os.path.join(os.path.dirname(__file__), f'{split}.bin')
        dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
        arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))
        total_batches = 1024

        idx = 0
        for batch_idx in tqdm(range(total_batches), desc=f'writing {filename}'):
            # Batch together samples for faster write
            batch = dset.shard(num_shards=total_batches, index=batch_idx, contiguous=True).with_format('numpy')
            arr_batch = np.concatenate(batch['ids'])
            # Write into mmap
            arr[idx : idx + len(arr_batch)] = arr_batch
            idx += len(arr_batch)
        arr.flush()

    # train.bin is ~17GB, val.bin ~8.5MB
    # train has ~9B tokens (9,035,582,198)
    # val has ~4M tokens (4,434,897)

    # to read the bin files later, e.g. with numpy:
    # m = np.memmap('train.bin', dtype=np.uint16, mode='r')
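
For reference, below is a minimal sketch of how the resulting .bin files can be consumed later. The np.memmap read follows the comment at the end of the script; the decoding check and the get_batch helper (with its block_size and batch_size parameters) are illustrative assumptions, not part of this file.

# sketch: read back the token stream written by prepare.py and sample a training batch
import numpy as np
import tiktoken

enc = tiktoken.get_encoding("gpt2")

# open the token stream without loading ~17GB into RAM
m = np.memmap('train.bin', dtype=np.uint16, mode='r')

# decode a small window back to text as a sanity check
print(enc.decode(m[:256].tolist()))

# hypothetical helper: sample a random batch of contiguous blocks for next-token prediction
def get_batch(data, block_size=1024, batch_size=8):
    ix = np.random.randint(0, len(data) - block_size, size=batch_size)
    x = np.stack([np.asarray(data[i : i + block_size], dtype=np.int64) for i in ix])
    y = np.stack([np.asarray(data[i + 1 : i + 1 + block_size], dtype=np.int64) for i in ix])
    return x, y  # inputs and shifted-by-one targets

x, y = get_batch(m)
print(x.shape, y.shape)  # (8, 1024) (8, 1024)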