albert_dataloaders.py

"""Data loaders for an ALBERT sentiment model: wrap a labeled reviews
DataFrame in a torch DataLoader that tokenizes, pads, and batches on the fly."""
import yaml
import torch
from functools import partial
from transformers import AlbertTokenizer
from torch.utils.data import DataLoader
from utils import preprocess

# Hyperparameters come from params.yaml; fall back to CPU when no GPU is available.
with open('params.yaml', 'r') as f:
    PARAMS = yaml.safe_load(f)

if torch.cuda.is_available():
    DEVICE = torch.device('cuda', PARAMS.get('gpu', 0))
else:
    DEVICE = torch.device('cpu')


class DataFrameDataLoader(DataLoader):
    def __init__(self, df, max_len, pretrained_model, do_lower_case, *args, **kwargs):
        # order is text, label
        self._tokenizer = AlbertTokenizer.from_pretrained(pretrained_model, do_lower_case=do_lower_case)
        self._data_iter = list(zip(df['review'], df['sentiment']))
        collate_batch = partial(self.collate_batch, max_len=max_len)
        super().__init__(self._data_iter, *args, collate_fn=collate_batch, **kwargs)

    def collate_batch(self, batch, max_len):
        label_list, text_list = [], []
        attention_masks = []
        for _text, _label in batch:
            label_list.append(_label)
            encoded_dict = self._tokenizer.encode_plus(
                preprocess.preprocess_text(_text, remove_punc=False),
                add_special_tokens=True,
                max_length=max_len,
                padding='max_length',   # replaces the deprecated pad_to_max_length=True
                truncation=True,
                return_attention_mask=True,
                return_tensors='pt',
            )
            text_list.append(encoded_dict['input_ids'])
            attention_masks.append(encoded_dict['attention_mask'])
        # Stack per-sample tensors into batch tensors and move them to the target device.
        label_list = torch.tensor(label_list, dtype=torch.float32)
        text_list = torch.cat(text_list, dim=0)
        attention_masks = torch.cat(attention_masks, dim=0)
        return label_list.to(DEVICE), text_list.to(DEVICE), attention_masks.to(DEVICE)
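
A minimal usage sketch (not part of the file): assuming params.yaml holds the gpu key read above, and given a pandas DataFrame with review and sentiment columns, the loader is constructed directly; extra keyword arguments such as batch_size and shuffle are forwarded to torch.utils.data.DataLoader. The file name train.csv, max_len=128, and batch_size=32 below are illustrative values, and 'albert-base-v2' is a standard ALBERT checkpoint name on the Hugging Face hub.

import pandas as pd
from albert_dataloaders import DataFrameDataLoader

df = pd.read_csv('train.csv')  # hypothetical CSV with 'review' and 'sentiment' columns

loader = DataFrameDataLoader(
    df,
    max_len=128,                        # illustrative sequence length
    pretrained_model='albert-base-v2',  # standard ALBERT checkpoint
    do_lower_case=True,
    batch_size=32,                      # forwarded to torch.utils.data.DataLoader
    shuffle=True,
)

for labels, input_ids, attention_masks in loader:
    # labels: (batch,); input_ids and attention_masks: (batch, max_len); all on DEVICE
    ...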