Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

japanese.py 4.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
  1. import re
  2. from unidecode import unidecode
  3. import pyopenjtalk
  4. # Regular expression matching Japanese without punctuation marks:
  5. _japanese_characters = re.compile(
  6. r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
  7. # Regular expression matching non-Japanese characters or punctuation marks:
  8. _japanese_marks = re.compile(
  9. r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
  10. # List of (symbol, Japanese) pairs for marks:
  11. _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
  12. ('%', 'パーセント')
  13. ]]
  14. # List of (romaji, ipa) pairs for marks:
  15. _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
  16. ('ts', 'ʦ'),
  17. ('u', 'ɯ'),
  18. ('...', '…'),
  19. ('j', 'ʥ'),
  20. ('y', 'j'),
  21. ('ni', 'n^i'),
  22. ('nj', 'n^'),
  23. ('hi', 'çi'),
  24. ('hj', 'ç'),
  25. ('f', 'ɸ'),
  26. ('I', 'i*'),
  27. ('U', 'ɯ*'),
  28. ('r', 'ɾ')
  29. ]]
  30. # Dictinary of (consonant, sokuon) pairs:
  31. _real_sokuon = {
  32. 'k': 'k#',
  33. 'g': 'k#',
  34. 't': 't#',
  35. 'd': 't#',
  36. 'ʦ': 't#',
  37. 'ʧ': 't#',
  38. 'ʥ': 't#',
  39. 'j': 't#',
  40. 's': 's',
  41. 'ʃ': 's',
  42. 'p': 'p#',
  43. 'b': 'p#'
  44. }
  45. # Dictinary of (consonant, hatsuon) pairs:
  46. _real_hatsuon = {
  47. 'p': 'm',
  48. 'b': 'm',
  49. 'm': 'm',
  50. 't': 'n',
  51. 'd': 'n',
  52. 'n': 'n',
  53. 'ʧ': 'n^',
  54. 'ʥ': 'n^',
  55. 'k': 'ŋ',
  56. 'g': 'ŋ'
  57. }
  58. def symbols_to_japanese(text):
  59. for regex, replacement in _symbols_to_japanese:
  60. text = re.sub(regex, replacement, text)
  61. return text
  62. def japanese_to_romaji_with_accent(text):
  63. '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
  64. text = symbols_to_japanese(text)
  65. sentences = re.split(_japanese_marks, text)
  66. marks = re.findall(_japanese_marks, text)
  67. text = ''
  68. for i, sentence in enumerate(sentences):
  69. if re.match(_japanese_characters, sentence):
  70. if text != '':
  71. text += ' '
  72. labels = pyopenjtalk.extract_fullcontext(sentence)
  73. for n, label in enumerate(labels):
  74. phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
  75. if phoneme not in ['sil', 'pau']:
  76. text += phoneme.replace('ch', 'ʧ').replace('sh',
  77. 'ʃ').replace('cl', 'Q')
  78. else:
  79. continue
  80. # n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
  81. a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
  82. a2 = int(re.search(r"\+(\d+)\+", label).group(1))
  83. a3 = int(re.search(r"\+(\d+)/", label).group(1))
  84. if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
  85. a2_next = -1
  86. else:
  87. a2_next = int(
  88. re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
  89. # Accent phrase boundary
  90. if a3 == 1 and a2_next == 1:
  91. text += ' '
  92. # Falling
  93. elif a1 == 0 and a2_next == a2 + 1:
  94. text += '↓'
  95. # Rising
  96. elif a2 == 1 and a2_next == 2:
  97. text += '↑'
  98. if i < len(marks):
  99. text += unidecode(marks[i]).replace(' ', '')
  100. return text
  101. def get_real_sokuon(text):
  102. text=re.sub('Q[↑↓]*(.)',lambda x:_real_sokuon[x.group(1)]+x.group(0)[1:] if x.group(1) in _real_sokuon.keys() else x.group(0),text)
  103. return text
  104. def get_real_hatsuon(text):
  105. text=re.sub('N[↑↓]*(.)',lambda x:_real_hatsuon[x.group(1)]+x.group(0)[1:] if x.group(1) in _real_hatsuon.keys() else x.group(0),text)
  106. return text
  107. def japanese_to_ipa(text):
  108. text=japanese_to_romaji_with_accent(text)
  109. for regex, replacement in _romaji_to_ipa:
  110. text = re.sub(regex, replacement, text)
  111. text = re.sub(
  112. r'([A-Za-zɯ])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
  113. text = get_real_sokuon(text)
  114. text = get_real_hatsuon(text)
  115. return text
Tip!

Press p to see the previous file, or n to see the next file.

Comments

Loading...