urszulaczerwinska
/
seqtagger


  
1

	
2

	
3

	
4

	
5

	
6

	
7

	
8

	
9

	
10

	
11

	
12

	
13

	
14

	
15

	
16

	
17

	
18

	
19

	
20

	
21

	
22

	
23

	
24

	
25

	
26

	
27

	
28

	
29

	
30

	
31

	
32

	
33

	
34

	
35

	
36

	
37

	
38

	
39

	
40

	
41

	
42

	
43

	
44

	
45

	
46

	
47

	
48

	
49

	
50

	
51

	
52

	
53

	
54

	
55

	
56

	
57

	
58

	
59

	
60

	
61

	
62

	
63

	
64

	
65

	
66

	
67

	
68

	
69

	
70

	
71

	
72

	
73

	
74

	
75

	
76

	
77

	
78

	
79

	
80

	
81

	
82

	
83

	
84

	
85

	
86

	
87

	
88

	
89

	
90

	
91

	
92

	
93

	
94

	
95

	
96

	
97

	
98

	
            import os
from flair.data import Corpus
from flair.datasets import ColumnCorpus
from flair.embeddings import *
from flair.trainers import ModelTrainer
from typing import List
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer
import ast
from torch.utils.tensorboard import SummaryWriter
import yaml
import json
from shutil import copyfile, move, rmtree, copy2
from extract_score import extract_score

# Location of the hyper-parameter file and of the artifacts written by this run.
pram_file = "./params/seqtag_params.yaml"
metrics_file = "./metrics/seqtag_metrics.json"
plots_file = "./metrics"
tensorboard_folder = "tensorboard/"


# Parse the YAML parameter file once; every later step reads from `params`.
with open(pram_file) as fd:
    params = yaml.safe_load(fd)


# 1. Build the corpus
#
# All corpus settings live in the "corpus" section of the parameter file.
corpus_params = params["corpus"]

# Column layout of the data files,
# e.g. {0: 'token', 1: 'tag', 2: 'space-after'}
columns = corpus_params["columns"]

# Folder in which the train, test and dev files reside.
data_folder = corpus_params["data_folder"]

# Names of the three split files.
train_file = corpus_params["train_file"]
test_file = corpus_params["test_file"]
dev_file = corpus_params["dev_file"]

# Create the corpus from the column format, the data folder and the split files.
corpus: Corpus = ColumnCorpus(
    data_folder,
    columns,
    train_file=train_file,
    test_file=test_file,
    dev_file=dev_file,
)

# 2. Tag type to predict
tag_type = 'ner'

# 3. Tag dictionary derived from the corpus for that tag type
tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

# 4. Build the embedding stack
#
# Each embedding is optional: it is added only when its key is present in the
# "embeddings" section of the parameter file. Flair embeddings are appended
# before the fastText word embeddings.
embedding_types: List[TokenEmbeddings] = []

if "flair-backward" in params["embeddings"]:
    flair_backward = FlairEmbeddings(
        params["embeddings"]["flair-backward"],
        chars_per_chunk=512,
    )
    embedding_types.append(flair_backward)

if "fasttext" in params["embeddings"]:
    fasttext = WordEmbeddings(params["embeddings"]["fasttext"])
    embedding_types.append(fasttext)

# Combine whatever was selected into a single stacked embedding.
embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

# 5. Build the sequence tagger; any further constructor options are taken
# from the "seqtagger" section of the parameter file.
tagger: SequenceTagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=tag_dictionary,
    tag_type=tag_type,
    **params["seqtagger"],
)

# 6. Build the trainer for this tagger and corpus; extra options come from
# the "trainer" section of the parameter file.
trainer: ModelTrainer = ModelTrainer(tagger, corpus, **params["trainer"])

# Output folder of the training run (also passed to trainer.train below as
# part of params["train"]).
path = params["train"]["base_path"]

# On a fresh start, remove the output of earlier runs before training.
# The previous code passed ignore_errors=True to rmtree, which suppresses every
# error and made the `except OSError` handlers unreachable; failures to clean
# up (e.g. permission problems) are now actually reported. A folder that does
# not exist yet is still skipped silently, since there is nothing to remove.
if params["type"]["is_fresh_start"]:
    for stale_dir, shown_name in (
        (path, path),
        (tensorboard_folder, tensorboard_folder + "*"),
    ):
        try:
            rmtree(stale_dir)
        except FileNotFoundError:
            # Nothing to clean up on the very first run.
            pass
        except OSError as e:
            # Best effort: report the problem and continue with training.
            print("Error: %s : %s" % (shown_name, e.strerror))

# 7. start training; all options come from the "train" section of the
# parameter file.
trainer.train(**params["train"])

# 8. eval
# Run the evaluation procedure on the test split; predictions are written
# into the training output folder.

result, _ = tagger.evaluate(corpus.test, out_path=f"{path}/predictions.txt")
print(result.log_line)

# Convert the evaluation result into a JSON-serializable structure and write
# it to the metrics file. The context manager guarantees the file is flushed
# and closed (the handle was previously left open).
det_res = extract_score(result)
with open(metrics_file, 'w') as jsonfile:
    json.dump(det_res, jsonfile)

# Copy the loss curve next to the metrics. os.path.join is used so this works
# whether or not `path` ends with a path separator (plain string
# concatenation produced "<path>loss.tsv" when it did not).
copy2(os.path.join(path, "loss.tsv"), plots_file)

# Store the run's "./runs/" folder under the tensorboard folder, named after
# the run.
move("./runs/", tensorboard_folder + params["type"]["run_name"]+"/")