Register
Login
Resources
Docs Blog Datasets Glossary Case Studies Tutorials & Webinars
Product
Data Engine LLMs Platform Enterprise
Pricing Explore
Connect to our Discord channel

update_translations.py 10.0 KB

You have to be logged in to leave a comment. Sign In
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
  1. # Ultralytics YOLO 🚀, AGPL-3.0 license
  2. """
  3. Script to fix broken Markdown links and front matter in language-specific directories zh, ko, ja, ru, de, fr, es, pt.
  4. This script processes markdown files in language-specific directories (like /zh/). It finds Markdown links and checks
  5. their existence. If a link is broken and does not exist in the language-specific directory but exists in the /en/
  6. directory, the script updates the link to point to the corresponding file in the /en/ directory.
  7. It also ensures that front matter keywords like 'comments:', 'description:', and 'keywords:' are not translated and
  8. remain in English.
  9. """
  10. import re
  11. from pathlib import Path
  12. class MarkdownLinkFixer:
  13. """Class to fix Markdown links and front matter in language-specific directories."""
  14. def __init__(self, base_dir, update_links=True, update_text=True):
  15. """Initialize the MarkdownLinkFixer with the base directory."""
  16. self.base_dir = Path(base_dir)
  17. self.update_links = update_links
  18. self.update_text = update_text
  19. self.md_link_regex = re.compile(r'\[([^]]+)]\(([^:)]+)\.md\)')
  20. @staticmethod
  21. def replace_front_matter(content, lang_dir):
  22. """Ensure front matter keywords remain in English."""
  23. english = ['comments', 'description', 'keywords']
  24. translations = {
  25. 'zh': ['评论', '描述', '关键词'], # Mandarin Chinese (Simplified) warning, sometimes translates as 关键字
  26. 'es': ['comentarios', 'descripción', 'palabras clave'], # Spanish
  27. 'ru': ['комментарии', 'описание', 'ключевые слова'], # Russian
  28. 'pt': ['comentários', 'descrição', 'palavras-chave'], # Portuguese
  29. 'fr': ['commentaires', 'description', 'mots-clés'], # French
  30. 'de': ['kommentare', 'beschreibung', 'schlüsselwörter'], # German
  31. 'ja': ['コメント', '説明', 'キーワード'], # Japanese
  32. 'ko': ['댓글', '설명', '키워드'], # Korean
  33. 'hi': ['टिप्पणियाँ', 'विवरण', 'कीवर्ड'], # Hindi
  34. 'ar': ['التعليقات', 'الوصف', 'الكلمات الرئيسية'] # Arabic
  35. } # front matter translations for comments, description, keyword
  36. for term, eng_key in zip(translations.get(lang_dir.stem, []), english):
  37. content = re.sub(rf'{term} *[::].*', f'{eng_key}: true', content, flags=re.IGNORECASE) if \
  38. eng_key == 'comments' else re.sub(rf'{term} *[::] *', f'{eng_key}: ', content, flags=re.IGNORECASE)
  39. return content
  40. @staticmethod
  41. def replace_admonitions(content, lang_dir):
  42. """Ensure front matter keywords remain in English."""
  43. english = [
  44. 'Note', 'Summary', 'Tip', 'Info', 'Success', 'Question', 'Warning', 'Failure', 'Danger', 'Bug', 'Example',
  45. 'Quote', 'Abstract', 'Seealso', 'Admonition']
  46. translations = {
  47. 'en':
  48. english,
  49. 'zh': ['笔记', '摘要', '提示', '信息', '成功', '问题', '警告', '失败', '危险', '故障', '示例', '引用', '摘要', '另见', '警告'],
  50. 'es': [
  51. 'Nota', 'Resumen', 'Consejo', 'Información', 'Éxito', 'Pregunta', 'Advertencia', 'Fracaso', 'Peligro',
  52. 'Error', 'Ejemplo', 'Cita', 'Abstracto', 'Véase También', 'Amonestación'],
  53. 'ru': [
  54. 'Заметка', 'Сводка', 'Совет', 'Информация', 'Успех', 'Вопрос', 'Предупреждение', 'Неудача', 'Опасность',
  55. 'Ошибка', 'Пример', 'Цитата', 'Абстракт', 'См. Также', 'Предостережение'],
  56. 'pt': [
  57. 'Nota', 'Resumo', 'Dica', 'Informação', 'Sucesso', 'Questão', 'Aviso', 'Falha', 'Perigo', 'Bug',
  58. 'Exemplo', 'Citação', 'Abstrato', 'Veja Também', 'Advertência'],
  59. 'fr': [
  60. 'Note', 'Résumé', 'Conseil', 'Info', 'Succès', 'Question', 'Avertissement', 'Échec', 'Danger', 'Bug',
  61. 'Exemple', 'Citation', 'Abstrait', 'Voir Aussi', 'Admonestation'],
  62. 'de': [
  63. 'Hinweis', 'Zusammenfassung', 'Tipp', 'Info', 'Erfolg', 'Frage', 'Warnung', 'Ausfall', 'Gefahr',
  64. 'Fehler', 'Beispiel', 'Zitat', 'Abstrakt', 'Siehe Auch', 'Ermahnung'],
  65. 'ja': ['ノート', '要約', 'ヒント', '情報', '成功', '質問', '警告', '失敗', '危険', 'バグ', '例', '引用', '抄録', '参照', '訓告'],
  66. 'ko': ['노트', '요약', '팁', '정보', '성공', '질문', '경고', '실패', '위험', '버그', '예제', '인용', '추상', '참조', '경고'],
  67. 'hi': [
  68. 'नोट', 'सारांश', 'सुझाव', 'जानकारी', 'सफलता', 'प्रश्न', 'चेतावनी', 'विफलता', 'खतरा', 'बग', 'उदाहरण',
  69. 'उद्धरण', 'सार', 'देखें भी', 'आगाही'],
  70. 'ar': [
  71. 'ملاحظة', 'ملخص', 'نصيحة', 'معلومات', 'نجاح', 'سؤال', 'تحذير', 'فشل', 'خطر', 'عطل', 'مثال', 'اقتباس',
  72. 'ملخص', 'انظر أيضاً', 'تحذير']}
  73. for term, eng_key in zip(translations.get(lang_dir.stem, []), english):
  74. if lang_dir.stem != 'en':
  75. content = re.sub(rf'!!! *{eng_key} *\n', f'!!! {eng_key} "{term}"\n', content, flags=re.IGNORECASE)
  76. content = re.sub(rf'!!! *{term} *\n', f'!!! {eng_key} "{term}"\n', content, flags=re.IGNORECASE)
  77. content = re.sub(rf'!!! *{term}', f'!!! {eng_key}', content, flags=re.IGNORECASE)
  78. content = re.sub(r'!!! *"', '!!! Example "', content, flags=re.IGNORECASE)
  79. return content
  80. @staticmethod
  81. def update_iframe(content):
  82. """Update the 'allow' attribute of iframe if it does not contain the specific English permissions."""
  83. english = 'accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share'
  84. pattern = re.compile(f'allow="(?!{re.escape(english)}).+?"')
  85. return pattern.sub(f'allow="{english}"', content)
  86. def link_replacer(self, match, parent_dir, lang_dir, use_abs_link=False):
  87. """Replace broken links with corresponding links in the /en/ directory."""
  88. text, path = match.groups()
  89. linked_path = (parent_dir / path).resolve().with_suffix('.md')
  90. if not linked_path.exists():
  91. en_linked_path = Path(str(linked_path).replace(str(lang_dir), str(lang_dir.parent / 'en')))
  92. if en_linked_path.exists():
  93. if use_abs_link:
  94. # Use absolute links WARNING: BUGS, DO NOT USE
  95. docs_root_relative_path = en_linked_path.relative_to(lang_dir.parent)
  96. updated_path = str(docs_root_relative_path).replace('en/', '/../')
  97. else:
  98. # Use relative links
  99. steps_up = len(parent_dir.relative_to(self.base_dir).parts)
  100. updated_path = Path('../' * steps_up) / en_linked_path.relative_to(self.base_dir)
  101. updated_path = str(updated_path).replace('/en/', '/')
  102. print(f"Redirecting link '[{text}]({path})' from {parent_dir} to {updated_path}")
  103. return f'[{text}]({updated_path})'
  104. else:
  105. print(f"Warning: Broken link '[{text}]({path})' found in {parent_dir} does not exist in /docs/en/.")
  106. return match.group(0)
  107. @staticmethod
  108. def update_html_tags(content):
  109. """Updates HTML tags in docs."""
  110. alt_tag = 'MISSING'
  111. # Remove closing slashes from self-closing HTML tags
  112. pattern = re.compile(r'<([^>]+?)\s*/>')
  113. content = re.sub(pattern, r'<\1>', content)
  114. # Find all images without alt tags and add placeholder alt text
  115. pattern = re.compile(r'!\[(.*?)\]\((.*?)\)')
  116. content, num_replacements = re.subn(pattern, lambda match: f'![{match.group(1) or alt_tag}]({match.group(2)})',
  117. content)
  118. # Add missing alt tags to HTML images
  119. pattern = re.compile(r'<img\s+(?!.*?\balt\b)[^>]*src=["\'](.*?)["\'][^>]*>')
  120. content, num_replacements = re.subn(pattern, lambda match: match.group(0).replace('>', f' alt="{alt_tag}">', 1),
  121. content)
  122. return content
  123. def process_markdown_file(self, md_file_path, lang_dir):
  124. """Process each markdown file in the language directory."""
  125. print(f'Processing file: {md_file_path}')
  126. with open(md_file_path, encoding='utf-8') as file:
  127. content = file.read()
  128. if self.update_links:
  129. content = self.md_link_regex.sub(lambda m: self.link_replacer(m, md_file_path.parent, lang_dir), content)
  130. if self.update_text:
  131. content = self.replace_front_matter(content, lang_dir)
  132. content = self.replace_admonitions(content, lang_dir)
  133. content = self.update_iframe(content)
  134. content = self.update_html_tags(content)
  135. with open(md_file_path, 'w', encoding='utf-8') as file:
  136. file.write(content)
  137. def process_language_directory(self, lang_dir):
  138. """Process each language-specific directory."""
  139. print(f'Processing language directory: {lang_dir}')
  140. for md_file in lang_dir.rglob('*.md'):
  141. self.process_markdown_file(md_file, lang_dir)
  142. def run(self):
  143. """Run the link fixing and front matter updating process for each language-specific directory."""
  144. for subdir in self.base_dir.iterdir():
  145. if subdir.is_dir() and re.match(r'^\w\w$', subdir.name):
  146. self.process_language_directory(subdir)
  147. if __name__ == '__main__':
  148. # Set the path to your MkDocs 'docs' directory here
  149. docs_dir = str(Path(__file__).parent.resolve())
  150. fixer = MarkdownLinkFixer(docs_dir, update_links=True, update_text=True)
  151. fixer.run()
Tip!

Press p to see the previous file, or n to see the next file

Comments

Loading...