import logging
from dataclasses import dataclass
from typing import Dict, List, Optional

from tqdm import tqdm
from transformers import PreTrainedTokenizer, is_torch_available

logger = logging.getLogger(__name__)

@dataclass
class InputFeatures:
    """
    A single set of features of data.
    Property names are the same names as the corresponding inputs to a model.
    """

    input_ids: List[int]
    attention_mask: List[int]
    token_type_ids: Optional[List[int]] = None
    metadata: Optional[dict] = None

def read_texts_from_file(file_path) -> List[str]:
    with open(file_path, encoding="utf-8") as f:
        texts = f.readlines()

    return texts

def convert_texts_to_features(
    texts: List[str],
    max_seq_length: int,
    tokenizer: PreTrainedTokenizer,
    cls_token="[CLS]",
    sep_token="[SEP]",
) -> List[InputFeatures]:
    """Convert lines of text from a .txt file into model input features."""
    features = []
    for t_idx, text in tqdm(enumerate(texts), total=len(texts)):
        tokens = tokenizer.tokenize(text)
        # Truncate to leave room for the tokenizer's special tokens
        # ([CLS] and [SEP] for BERT; the exact count is model-specific).
        special_tokens_count = tokenizer.num_special_tokens_to_add()
        if len(tokens) > max_seq_length - special_tokens_count:
            tokens = tokens[: (max_seq_length - special_tokens_count)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids:   0   0   0   0  0     0   0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        # (A padded single-sequence example is sketched just after this function.)
        tokens += [sep_token]
        segment_ids = [0] * len(tokens)

        # CLS token
        tokens = [cls_token] + tokens
        segment_ids = [0] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding_length = max_seq_length - len(input_ids)
        input_ids += [0] * padding_length
        input_mask += [0] * padding_length
        segment_ids += [0] * padding_length

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        if t_idx < 5:
            logger.info("*** Example ***")
            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=input_mask,
                token_type_ids=segment_ids,
                metadata={
                    "text": text.strip(),
                    "text_id": str(t_idx),
                    "tokens": tokens,
                },
            )
        )
    return features
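
# Illustrative sketch (not from the original module): for a single line such as
# "the dog is hairy ." with max_seq_length=10, convert_texts_to_features would
# produce something like
#   tokens:          [CLS] the dog is hairy . [SEP]
#   attention_mask:    1    1   1  1    1   1   1   0 0 0
#   token_type_ids:    0    0   0  0    0   0   0   0 0 0
# with input_ids zero-padded to length 10; the concrete ids depend on the
# tokenizer's vocabulary.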

if is_torch_available():
    import torch
    from torch.utils.data.dataset import Dataset

    class EmbeddingDataset(Dataset):
        """
        This will be superseded by a framework-agnostic approach soon.
        """

        features: List[InputFeatures]

        def __init__(
            self,
            data_path: str,
            tokenizer: PreTrainedTokenizer,
            max_seq_length: int,
        ):
            logger.info(f"Creating features from dataset file at {data_path}")
            texts = read_texts_from_file(data_path)
            self.features = convert_texts_to_features(
                texts=texts,
                max_seq_length=max_seq_length,
                tokenizer=tokenizer,
            )

        def __len__(self):
            return len(self.features)

        def __getitem__(self, i) -> InputFeatures:
            return self.features[i]

    def data_collator(features: List[InputFeatures]) -> Dict[str, torch.Tensor]:
        first = features[0]
        batch = {}
        for k in first.__dict__:
            if k == "metadata":
                # Metadata is not tensorizable, so it is passed through as a list.
                batch[k] = [f.__dict__[k] for f in features]
            else:
                batch[k] = torch.tensor([f.__dict__[k] for f in features], dtype=torch.long)
        return batch
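
# --- Illustrative usage sketch (not part of the original module) ---
# The data path and the "bert-base-uncased" checkpoint below are placeholder
# choices for demonstration; swap in whatever data and model you actually use.
if __name__ == "__main__" and is_torch_available():
    from torch.utils.data import DataLoader
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    dataset = EmbeddingDataset(
        data_path="data/sentences.txt",  # hypothetical one-example-per-line file
        tokenizer=tokenizer,
        max_seq_length=128,
    )
    loader = DataLoader(dataset, batch_size=8, collate_fn=data_collator)

    batch = next(iter(loader))
    # input_ids / attention_mask / token_type_ids are LongTensors of shape
    # (batch_size, max_seq_length); metadata stays a list of per-example dicts.
    print({k: v.shape if isinstance(v, torch.Tensor) else len(v) for k, v in batch.items()})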