Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

train_seq_tagger.py 3.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
  1. import os
  2. from flair.data import Corpus
  3. from flair.datasets import ColumnCorpus
  4. from flair.embeddings import *
  5. from flair.trainers import ModelTrainer
  6. from typing import List
  7. from flair.models import SequenceTagger
  8. from flair.trainers import ModelTrainer
  9. import ast
  10. from torch.utils.tensorboard import SummaryWriter
  11. import yaml
  12. import json
  13. from shutil import copyfile, move, rmtree, copy2
  14. from extract_score import extract_score
  # Train and evaluate a Flair sequence tagger configured by a YAML parameter
  # file, then export the evaluation metrics, the loss curve and the
  # TensorBoard run directory.

  pram_file = "./params/seqtag_params.yaml"
  metrics_file = "./metrics/seqtag_metrics.json"
  plots_file = "./metrics"
  tensorboard_folder = "tensorboard/"

  # import params
  # The script reads these top-level sections from the YAML file:
  # "corpus", "embeddings", "seqtagger", "trainer", "train" and "type".
  with open(pram_file, 'r') as fd:
      params = yaml.safe_load(fd)

  # 1. get the corpus
  # define columns, e.g. {0: 'token', 1: 'tag', 2: 'space-after'}
  columns = params["corpus"]["columns"]

  # this is the folder in which train, test and dev files reside
  data_folder = params["corpus"]["data_folder"]
  train_file = params["corpus"]["train_file"]
  test_file = params["corpus"]["test_file"]
  dev_file = params["corpus"]["dev_file"]

  # init a corpus using column format, data folder and the names of the
  # train, dev and test files
  corpus: Corpus = ColumnCorpus(
      data_folder,
      columns,
      train_file=train_file,
      test_file=test_file,
      dev_file=dev_file,
  )

  # 2. what tag do we want to predict?
  tag_type = 'ner'

  # 3. make the tag dictionary from the corpus
  tag_dictionary = corpus.make_tag_dictionary(tag_type=tag_type)

  # 4. initialize embeddings
  # Each embedding is optional: it is stacked only when its key is present in
  # the "embeddings" section of the parameter file.
  embedding_types: List[TokenEmbeddings] = []
  if "flair-backward" in params["embeddings"]:
      embedding_types.append(
          FlairEmbeddings(params["embeddings"]["flair-backward"], chars_per_chunk=512)
      )
  if "fasttext" in params["embeddings"]:
      embedding_types.append(WordEmbeddings(params["embeddings"]["fasttext"]))
  embeddings: StackedEmbeddings = StackedEmbeddings(embeddings=embedding_types)

  # 5. initialize sequence tagger
  # Remaining tagger hyper-parameters come straight from the "seqtagger" section.
  tagger: SequenceTagger = SequenceTagger(
      embeddings=embeddings,
      tag_dictionary=tag_dictionary,
      tag_type=tag_type,
      **params["seqtagger"],
  )

  # 6. initialize trainer
  trainer: ModelTrainer = ModelTrainer(tagger, corpus, **params["trainer"])

  # Output directory of the run; the same value is passed to trainer.train()
  # below as part of params["train"].
  path = params["train"]["base_path"]

  if params["type"]["is_fresh_start"]:
      # Best-effort cleanup of previous outputs. ignore_errors=True already
      # suppresses every removal error (including a missing directory), so the
      # former try/except OSError wrappers could never fire and were dropped.
      # NOTE(review): the original indentation was lost; the TensorBoard cleanup
      # is assumed to belong to the fresh-start branch as well - confirm.
      rmtree(path, ignore_errors=True)
      rmtree(tensorboard_folder, ignore_errors=True)

  # 7. start training
  trainer.train(**params["train"])

  # 8. eval
  # run evaluation procedure
  result, _ = tagger.evaluate(corpus.test, out_path=f"{path}/predictions.txt")
  print(result.log_line)

  # Compute the scores before opening the metrics file so that a failure in
  # extract_score does not leave a truncated file behind, and use a context
  # manager so the JSON is flushed and the handle is closed.
  det_res = extract_score(result)
  with open(metrics_file, 'w') as jsonfile:
      json.dump(det_res, jsonfile)

  # os.path.join works whether or not base_path ends with a path separator
  # (plain string concatenation required a trailing "/").
  copy2(os.path.join(path, "loss.tsv"), plots_file)

  # Archive the TensorBoard event files of this run under its run name.
  move("./runs/", tensorboard_folder + params["type"]["run_name"] + "/")
Tip!

Press p or ← to see the previous file, or n or → to see the next file

Comments

Loading...