ML-Purdue
/
hackathonf23-Creativity_Underflow


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
            import numpy as np
import pickle
import torch
import tqdm
from torch.nn import functional as F

# Pack every pickled text segment into one fixed-shape float32 array on disk and
# record, for each segment, which row of that array it was written to.

max_tokens = 20  # both leading axes of every segment are padded/cropped to this

# mode="w+" creates (or overwrites) the backing file, which is roughly 192 GB.
# NOTE: np.memmap stores a raw buffer with no .npy header, so despite the
# extension this file has to be reopened with np.memmap (same dtype and shape).
data = np.memmap("text_data.npy", dtype=np.float32, mode="w+",
                 shape=(470_000, max_tokens, max_tokens, 256))

data_index = []  # (key, strings, data_index)
index = 0
for i in range(47):
    print(f"Loading data {i}...")
    # NOTE(security): pickle.load can execute arbitrary code; only read shard
    # files that come from a trusted source.
    with open(f"./train_data/data_text_saves{i}.pk", "rb") as f:
        text_data = pickle.load(f)
    # Each seg is indexed as (seg[0], seg[1], seg[2]); seg[2] must be a NumPy
    # array because it is handed to torch.from_numpy.
    # Presumably seg[2] is (tokens, tokens, 256) — TODO confirm against the
    # code that produces the shards.
    for seg in tqdm.tqdm(text_data):
        if index >= data.shape[0]:
            # Fail with a clear message instead of an opaque out-of-bounds
            # error once the preallocated rows are used up.
            raise IndexError(
                f"text_data.npy only has room for {data.shape[0]} segments; "
                f"shard {i} contains more data than was preallocated"
            )
        data_index.append((seg[0], seg[1], index))
        text_segs = torch.from_numpy(seg[2])
        rows, cols = text_segs.shape[0], text_segs.shape[1]
        # F.pad consumes (before, after) pairs starting from the LAST axis:
        # nothing on the feature axis, then axis 1, then axis 0. Each leading
        # axis is padded from its own length, so non-square segments are
        # handled too; oversized axes get no padding and are cropped below.
        text_segs = F.pad(
            text_segs,
            pad=(0, 0, 0, max(0, max_tokens - cols), 0, max(0, max_tokens - rows)),
            mode='constant',
            value=0,
        )
        text_segs = text_segs[:max_tokens, :max_tokens, :]
        data[index] = text_segs.numpy()
        index += 1
with open("./train_data/data_index.pk", "wb") as f:
    pickle.dump(data_index, f)
# Push any pages still buffered in memory out to text_data.npy.
data.flush()