Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

vocab.py 1.5 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
  1. import json
  2. from typing import Dict
  3. class Vocab:
  4. def __init__(self, pre_made: Dict[str,int] = None):
  5. if pre_made:
  6. self.token_to_id = pre_made
  7. self.id_to_token = {pre_made[token] for token in pre_made}
  8. else:
  9. self.token_to_id = {}
  10. self.id_to_token = {}
  11. self.unk = "<UNK>"
  12. self.add_token(self.unk)
  13. self.bos = "<BOS>"
  14. self.add_token(self.bos)
  15. self.eos = "<EOS>"
  16. self.add_token(self.eos)
  17. def __len__(self):
  18. return len(self.token_to_id)
  19. def add_token(self,token: str) -> int:
  20. if token in self.token_to_id:
  21. index = self.token_to_id[token]
  22. else:
  23. index = len(self.token_to_id)
  24. self.token_to_id[token] = index
  25. self.id_to_token[index] = token
  26. return index
  27. def lookup_id(self,token: str) -> int:
  28. if token in self.token_to_id:
  29. return self.token_to_id[token]
  30. else:
  31. return self.token_to_id[self.unk]
  32. def lookup_token(self,j: int) -> str:
  33. if j in self.id_to_token.keys():
  34. return self.id_to_token[j]
  35. else:
  36. raise KeyError(f"{j} not a valid index.")
  37. def to_json(self,path: str):
  38. with open(path, 'w') as file:
  39. json.dump(self.token_to_id, file)
  40. @classmethod
  41. def from_json(cls,path: str):
  42. with open(path, 'r') as file:
  43. pre_made = json.load(file)
  44. return cls(pre_made)
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...