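"""Precompute GroupViT text embeddings and vision attention maps for the
dataset returned by loadData, and cache them as pickle files in the
configured output directory."""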
import os
import pickle
import tqdm
from PIL import Image
import torch
from torch.nn import functional as F
from transformers import AutoProcessor, GroupViTVisionModel, GroupViTTextModel, CLIPTokenizer
import settings
from utils.embeddingDataset import loadData
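# which split to process and which modalities to precompute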
train = True
do_text = True
do_image = False
if train:
    out_dir = settings.TrainSettings.Output.out_dir
else:
    out_dir = settings.ValidationSettings.Output.out_dir
def batch(iterable, n=1):
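    """Yield consecutive slices of `iterable` of length at most n."""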
    l = len(iterable)
    for ndx in range(0, l, n):
        yield iterable[ndx:min(ndx + n, l)]
def processData():
    # load the dataset index and the pretrained GroupViT encoders
    imgs, entities, captions, _ = loadData(train)
    processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
    model_image = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc").to("cuda")
    tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
    model_text = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc").to("cuda")
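    # buffer of processed text samples, flushed to numbered pickle files in chunks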
    models_preprocess = []
    saves = 0
    with torch.no_grad():
        if do_text:
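            # build one entity-framed prompt per (caption, entity) pair and embed them all in a single forward pass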
            for img in tqdm.tqdm(imgs):
                key = int(os.path.split(img)[-1].split(".")[0])
                strings = []
                for index, caption in enumerate(captions[key][1:]):
                    for i in entities[key][index]:
                        strings.append(f"{entities[key][index][i]} {caption} {entities[key][index][i]}")
                text_inputs = tokenizer(strings, padding=True, return_tensors="pt")
                text_inputs["input_ids"] = text_inputs["input_ids"].to("cuda")
                text_inputs["attention_mask"] = text_inputs["attention_mask"].to("cuda")
                text_segs = model_text(input_ids=text_inputs["input_ids"],
                                       attention_mask=text_inputs["attention_mask"],
                                       output_attentions=True)
                text_segs = text_segs.last_hidden_state
                # regroup the per-string embeddings by caption and pad each stack to a fixed 20x20x256 block
                i = 0
                for index, caption in enumerate(captions[key][1:]):
                    segments = []
                    strings = []
                    for entity in entities[key][index]:
                        segment = text_segs[i]
                        i += 1
                        strings.append(f"{entities[key][index][entity]} {caption} {entities[key][index][entity]}")
                        segments.append(segment)
                    if len(segments):
                        text_segs_batch = torch.stack(segments)
                        text_segs_batch = F.pad(input=text_segs_batch,
                                                pad=(0, 0,
                                                     0, max(0, 20 - text_segs_batch.size(1)),
                                                     0, max(0, 20 - text_segs_batch.size(0))),
                                                mode='constant', value=0).detach().cpu().numpy()
                        models_preprocess.append((key, strings, text_segs_batch))
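                        # every 10,000 buffered samples, write a numbered checkpoint and reset the buffer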
                        if len(models_preprocess) % 10_000 == 0:
                            save_path = os.path.join(out_dir, f"data_text_saves{saves}.pk")
                            if os.path.exists(save_path):
                                os.remove(save_path)
                            with open(save_path, "wb") as f:
                                pickle.dump(models_preprocess, f)
                            print(f"Saved {len(models_preprocess)} to {save_path}")
                            saves += 1
                            models_preprocess = []
            print()
            print("Done processing text data")
            with open(os.path.join(out_dir, "data_text.pk"), "wb") as f:
                pickle.dump(models_preprocess, f)
            print(f"Saved data to {os.path.join(out_dir, 'data_text.pk')}")
        if do_image:
            # process the images in batches and cache one attention map per image key
            images_processed = {}
            for b in tqdm.tqdm(batch(imgs, 32), total=len(imgs) // 32 + (1 if len(imgs) % 32 != 0 else 0)):
                img_inputs = processor(images=[Image.open(img) for img in b], return_tensors="pt").to("cuda")
                outputs = model_image(**img_inputs, output_attentions=True)
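                # keep the first attention tensor for each image, keyed by its numeric file name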
                for i, img in enumerate(b):
                    key = int(os.path.split(img)[-1].split(".")[0])
                    images_processed[key] = outputs.attentions[0][i].detach().cpu().numpy()
            print()
            print("Done processing image data")
            with open(os.path.join(out_dir, "data_images.pk"), "wb") as f:
                pickle.dump(images_processed, f)
            print(f"Saved data to {os.path.join(out_dir, 'data_images.pk')}")
if __name__ == "__main__":
    processData()