1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
|
- import re
- from unidecode import unidecode
- import pyopenjtalk
- # Regular expression matching Japanese without punctuation marks:
- _japanese_characters = re.compile(
- r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
- # Regular expression matching non-Japanese characters or punctuation marks:
- _japanese_marks = re.compile(
- r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
- # List of (symbol, Japanese) pairs for marks:
- _symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
- ('%', 'パーセント')
- ]]
- # List of (romaji, ipa) pairs for marks:
- _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
- ('ts', 'ʦ'),
- ('u', 'ɯ'),
- ('...', '…'),
- ('j', 'ʥ'),
- ('y', 'j'),
- ('ni', 'n^i'),
- ('nj', 'n^'),
- ('hi', 'çi'),
- ('hj', 'ç'),
- ('f', 'ɸ'),
- ('I', 'i*'),
- ('U', 'ɯ*'),
- ('r', 'ɾ')
- ]]
- # Dictinary of (consonant, sokuon) pairs:
- _real_sokuon = {
- 'k': 'k#',
- 'g': 'k#',
- 't': 't#',
- 'd': 't#',
- 'ʦ': 't#',
- 'ʧ': 't#',
- 'ʥ': 't#',
- 'j': 't#',
- 's': 's',
- 'ʃ': 's',
- 'p': 'p#',
- 'b': 'p#'
- }
- # Dictinary of (consonant, hatsuon) pairs:
- _real_hatsuon = {
- 'p': 'm',
- 'b': 'm',
- 'm': 'm',
- 't': 'n',
- 'd': 'n',
- 'n': 'n',
- 'ʧ': 'n^',
- 'ʥ': 'n^',
- 'k': 'ŋ',
- 'g': 'ŋ'
- }
- def symbols_to_japanese(text):
- for regex, replacement in _symbols_to_japanese:
- text = re.sub(regex, replacement, text)
- return text
- def japanese_to_romaji_with_accent(text):
- '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
- text = symbols_to_japanese(text)
- sentences = re.split(_japanese_marks, text)
- marks = re.findall(_japanese_marks, text)
- text = ''
- for i, sentence in enumerate(sentences):
- if re.match(_japanese_characters, sentence):
- if text != '':
- text += ' '
- labels = pyopenjtalk.extract_fullcontext(sentence)
- for n, label in enumerate(labels):
- phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
- if phoneme not in ['sil', 'pau']:
- text += phoneme.replace('ch', 'ʧ').replace('sh',
- 'ʃ').replace('cl', 'Q')
- else:
- continue
- # n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
- a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
- a2 = int(re.search(r"\+(\d+)\+", label).group(1))
- a3 = int(re.search(r"\+(\d+)/", label).group(1))
- if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
- a2_next = -1
- else:
- a2_next = int(
- re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
- # Accent phrase boundary
- if a3 == 1 and a2_next == 1:
- text += ' '
- # Falling
- elif a1 == 0 and a2_next == a2 + 1:
- text += '↓'
- # Rising
- elif a2 == 1 and a2_next == 2:
- text += '↑'
- if i < len(marks):
- text += unidecode(marks[i]).replace(' ', '')
- return text
- def get_real_sokuon(text):
- text=re.sub('Q[↑↓]*(.)',lambda x:_real_sokuon[x.group(1)]+x.group(0)[1:] if x.group(1) in _real_sokuon.keys() else x.group(0),text)
- return text
- def get_real_hatsuon(text):
- text=re.sub('N[↑↓]*(.)',lambda x:_real_hatsuon[x.group(1)]+x.group(0)[1:] if x.group(1) in _real_hatsuon.keys() else x.group(0),text)
- return text
- def japanese_to_ipa(text):
- text=japanese_to_romaji_with_accent(text)
- for regex, replacement in _romaji_to_ipa:
- text = re.sub(regex, replacement, text)
- text = re.sub(
- r'([A-Za-zɯ])\1+', lambda x: x.group(0)[0]+'ː'*(len(x.group(0))-1), text)
- text = get_real_sokuon(text)
- text = get_real_hatsuon(text)
- return text
|