Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

vectorizer.py 1.7 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
  1. from typing import List
  2. import torch
  3. from torch import Tensor
  4. # import torchnlp.word_to_vector.pretrained_word_vectors._PretrainedWordVectors as BaseWordVectorizer
  5. from bald.vocab import Vocab
  6. class Vectorizer:
  7. def __init__(self, vocab: Vocab):
  8. self.vocab = vocab
  9. def pre_vectorize(self,sequence: List[str]) -> List[int]:
  10. out = [self.vocab.lookup_id(self.vocab.bos)]
  11. out.extend([self.vocab.lookup_id(token) for token in sequence])
  12. out.append(self.vocab.lookup_id(self.vocab.eos))
  13. return out
  14. def vectorize(self,sequence: List[str]) -> Tensor:
  15. pre_vectorized = self.pre_vectorize(sequence)
  16. pre_vectorized = [Tensor(token) for token in pre_vectorized]
  17. return torch.stack(pre_vectorized)
  18. class LabelVectorizer:
  19. '''
  20. Class to vectorize the NER tags
  21. '''
  22. encoding = {
  23. 'O':0,
  24. 'B-PER':1,
  25. 'I-PER':1,
  26. 'B-ORG':2,
  27. 'I-ORG':2,
  28. 'B-LOC':3,
  29. 'I-LOC':3,
  30. 'B-MISC':4,
  31. 'I-MISC':4,
  32. }
  33. @classmethod
  34. def pre_vectorize(cls,sequence: List[str]) -> List[int]:
  35. out = [0]
  36. middle = [cls.encoding[tag] for tag in sequence]
  37. out.extend(middle)
  38. out.append(0)
  39. return out
  40. @classmethod
  41. def vectorize(cls,sequence: List[str]) -> Tensor:
  42. pre_vectorized = cls.pre_vectorize(sequence)
  43. pre_vectorized = [Tensor(token) for token in pre_vectorized]
  44. return torch.stack(pre_vectorized)
  45. class WordVectorizer:
  46. def __init__(self, vectorizer: BaseWordVectorizer):
  47. self.vectorizer = vectorizer
  48. def vectorize(self, sequence: List[str]) -> Tensor:
  49. return self.vectorizer(text)
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...