Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

project.yml 3.2 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
  # Project metadata: human-readable title and description of this project file.
  title: "Pre-annotate data with Regex"
  description: "In this project we pre-annotate some data (labdocs) with regex patterns, thanks to Spacy, in order to correct them with Label Studio. The resulting annotated dataset is used (in other project) train a Spacy Ner model "
  # ================================ vars ================================
  # Values substituted into the commands below through ${vars.<name>}.
  vars:
    # Database connection settings; the get_init command passes them to
    # scripts/get_init.py on the command line.
    # NOTE(review): the password is stored in plain text in this file;
    # consider supplying it from outside version control instead.
    user: "root"
    # Quoted so the all-digit value stays a string instead of an integer.
    password: "11950022"
    host: "localhost"
    database: "Labnbook"
    # Number of sample text labdocs to annotate (passed to get_sample.py)
    sample_size: 500
    # Passed to get_train.py; presumably the share of the sample used for
    # training - TODO confirm against the script.
    train_size: 0.75
    # The name of the jsonl file of the text labdocs
    name: "labdoc_init"
    lang: "fr"
    # Regex patterns for cleaning text labdocs
    regex_text: "regex_text.json"
    # Regex patterns for sampling labdocs labels
    regex_ner: "regex_ner.json"
    # Only referenced by the commented-out outputs of the get_train command.
    train: "train"
    dev: "dev"
  # ============================ directories ============================
  # Directories used by this project; the commands read from and write to
  # source/, scripts/, data/ and configs/.
  directories: ["source", "scripts", "data", "configs"]
  # Files that should be available in the directory
  # NOTE(review): spaCy's project.yml documentation names the file-listing
  # section `assets`; confirm that `configs` is read by the tooling in use.
  configs:
    - dest: "configs/${vars.regex_ner}"
      description: "Regex patterns for annotate labels"
    - dest: "configs/${vars.regex_text}"
      description: "Regex patterns for clean text"
  # ============================= workflows =============================
  # A workflow is a named, ordered list of command names defined under
  # `commands`.
  workflows:
    # Full pipeline: extract all labdocs, sample them, build the train set.
    get_labdocs:
      - get_init
      - get_sample
      - get_train
  # ============================== commands ==============================
  # Each command declares the shell line(s) it runs (`script`), the files
  # it needs (`deps`) and the files it produces (`outputs`).
  commands:
    # Runs scripts/get_init.py with the database settings and the
    # text-cleaning regex file; declares source/<name>.jsonl as output.
    - name: "get_init"
      help: "Get all cleaned text Labdocs according to the regex patterns ${vars.regex_text}. We extract one Labdoc from each report. See the 'clean_text' function for more details about the applied clean."
      script:
        - "python3 scripts/get_init.py ${vars.user} ${vars.host} ${vars.database} ${vars.password} ${vars.name} configs/${vars.regex_text}"
      outputs:
        - "source/${vars.name}.jsonl"
      deps:
        - "configs/${vars.regex_text}"
    ## --------------------------------------------------------------------
    # Runs scripts/get_sample.py on the get_init output and declares
    # source/<name>_sample.jsonl as output.
    - name: "get_sample"
      help: "Get a sample of Labdocs with equations and table "
      script:
        - "python3 scripts/get_sample.py source/${vars.name}.jsonl source/${vars.name}_sample.jsonl ${vars.sample_size} configs/${vars.regex_ner}"
      outputs:
        - "source/${vars.name}_sample.jsonl"
      deps:
        - "source/${vars.name}.jsonl"
        # The script reads this regex file, so it is declared as a
        # dependency, matching how get_init declares its regex file.
        - "configs/${vars.regex_ner}"
    ## --------------------------------------------------------------------
    # Runs scripts/get_train.py on the sampled labdocs and declares
    # data/all.spacy and data/all.json as outputs.
    - name: "get_train"
      help: "Get a sample of Labdocs according to the Regex patterns in ${vars.regex_ner} for training and testing a NER model from scratch"
      script:
        - "python3 scripts/get_train.py ${vars.lang} source/${vars.name}_sample.jsonl ${vars.train_size} configs/${vars.regex_ner}"
      outputs:
        # - data/${vars.dev}.json
        # - data/${vars.train}.json
        # - data/${vars.dev}.spacy
        # - data/${vars.train}.spacy
        - "data/all.spacy"
        - "data/all.json"
      deps:
        - "source/${vars.name}_sample.jsonl"
        # The script reads this regex file, so it is declared as a
        # dependency, matching how get_init declares its regex file.
        - "configs/${vars.regex_ner}"
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...