Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

build_sym_alignment.py 3.8 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
  1. # Copyright (c) 2017-present, Facebook, Inc.
  2. # All rights reserved.
  3. #
  4. # This source code is licensed under the license found in the LICENSE file in
  5. # the root directory of this source tree. An additional grant of patent rights
  6. # can be found in the PATENTS file in the same directory.
  7. #
"""
Use this script to build symmetric word alignments for your translation
dataset.

This script depends on the fast_align and mosesdecoder tools. You will need
to build both before running the script.

fast_align:
    github: http://github.com/clab/fast_align
    instructions: follow the instructions in README.md
mosesdecoder:
    github: http://github.com/moses-smt/mosesdecoder
    instructions: http://www.statmt.org/moses/?n=Development.GetStarted

The script produces the following files under --output_dir:
    text.joined - line-by-line concatenation of the source_file and the
        target_file, each line formatted as "source ||| target".
    align.forward - forward pass of fast_align.
    align.backward - backward pass of fast_align.
    aligned.sym_heuristic - symmetrized alignment.
"""
import argparse
import os
import subprocess
from itertools import zip_longest
  29. def main():
  30. parser = argparse.ArgumentParser(description='symmetric alignment builer')
  31. # fmt: off
  32. parser.add_argument('--fast_align_dir',
  33. help='path to fast_align build directory')
  34. parser.add_argument('--mosesdecoder_dir',
  35. help='path to mosesdecoder root directory')
  36. parser.add_argument('--sym_heuristic',
  37. help='heuristic to use for symmetrization',
  38. default='grow-diag-final-and')
  39. parser.add_argument('--source_file',
  40. help='path to a file with sentences '
  41. 'in the source language')
  42. parser.add_argument('--target_file',
  43. help='path to a file with sentences '
  44. 'in the target language')
  45. parser.add_argument('--output_dir',
  46. help='output directory')
  47. # fmt: on
  48. args = parser.parse_args()
  49. fast_align_bin = os.path.join(args.fast_align_dir, 'fast_align')
  50. symal_bin = os.path.join(args.mosesdecoder_dir, 'bin', 'symal')
  51. sym_fast_align_bin = os.path.join(
  52. args.mosesdecoder_dir, 'scripts', 'ems',
  53. 'support', 'symmetrize-fast-align.perl')
  54. # create joined file
  55. joined_file = os.path.join(args.output_dir, 'text.joined')
  56. with open(args.source_file, 'r', encoding='utf-8') as src, open(args.target_file, 'r', encoding='utf-8') as tgt:
  57. with open(joined_file, 'w', encoding='utf-8') as joined:
  58. for s, t in zip_longest(src, tgt):
  59. print('{} ||| {}'.format(s.strip(), t.strip()), file=joined)
  60. bwd_align_file = os.path.join(args.output_dir, 'align.backward')
  61. # run forward alignment
  62. fwd_align_file = os.path.join(args.output_dir, 'align.forward')
  63. fwd_fast_align_cmd = '{FASTALIGN} -i {JOINED} -d -o -v > {FWD}'.format(
  64. FASTALIGN=fast_align_bin,
  65. JOINED=joined_file,
  66. FWD=fwd_align_file)
  67. assert os.system(fwd_fast_align_cmd) == 0
  68. # run backward alignment
  69. bwd_align_file = os.path.join(args.output_dir, 'align.backward')
  70. bwd_fast_align_cmd = '{FASTALIGN} -i {JOINED} -d -o -v -r > {BWD}'.format(
  71. FASTALIGN=fast_align_bin,
  72. JOINED=joined_file,
  73. BWD=bwd_align_file)
  74. assert os.system(bwd_fast_align_cmd) == 0
  75. # run symmetrization
  76. sym_out_file = os.path.join(args.output_dir, 'aligned')
  77. sym_cmd = '{SYMFASTALIGN} {FWD} {BWD} {SRC} {TGT} {OUT} {HEURISTIC} {SYMAL}'.format(
  78. SYMFASTALIGN=sym_fast_align_bin,
  79. FWD=fwd_align_file,
  80. BWD=bwd_align_file,
  81. SRC=args.source_file,
  82. TGT=args.target_file,
  83. OUT=sym_out_file,
  84. HEURISTIC=args.sym_heuristic,
  85. SYMAL=symal_bin
  86. )
  87. assert os.system(sym_cmd) == 0
  88. if __name__ == '__main__':
  89. main()
Tip!

Press p or to see the previous file or, n or to see the next file

Comments

Loading...