ClickHouse/docs/tools/translate/filter.py
Ivan Blinkov 2e1f6bc56d
[experimental] add "es" docs language as machine translated draft (#9787)
* replace exit with assert in test_single_page

* improve save_raw_single_page docs option

* More grammar fixes

* "Built from" link in new tab

* fix mistype

* Example of include in docs

* add anchor to meeting form

* Draft of translation helper

* WIP on translation helper

* Replace some fa docs content with machine translation

* add normalize-en-markdown.sh

* normalize some en markdown

* normalize some en markdown

* admonition support

* normalize

* normalize

* normalize

* support wide tables

* normalize

* normalize

* normalize

* normalize

* normalize

* normalize

* normalize

* normalize

* normalize

* normalize

* normalize

* normalize

* normalize

* lightly edited machine translation of introspection.md

* lightly edited machine translation of lazy.md

* WIP on translation utils

* Normalize ru docs

* Normalize other languages

* some fixes

* WIP on normalize/translate tools

* add requirements.txt

* [experimental] add es docs language as machine translated draft

* remove duplicate script

* Back to wider tab-stop (narrow renders not so well)
2020-03-21 07:11:51 +03:00

153 lines
4.8 KiB
Python
Executable File

#!/usr/bin/env python3
import os
import sys
import json.decoder
import pandocfilters
import slugify
import translate
# Debug tracing is switched on by the mere presence of the DEBUG environment
# variable, whatever its value (an empty string counts too).
is_debug = 'DEBUG' in os.environ


def debug(*args):
    """Print *args* to stderr, but only when debug tracing is enabled."""
    if not is_debug:
        return
    print(*args, file=sys.stderr)
def process_buffer(buffer, new_value, item=None):
    """Translate the buffered text and append the result to *new_value*.

    The strings in *buffer* are concatenated, passed to
    ``translate.translate`` and the translation is re-emitted as alternating
    ``Str`` / ``Space`` elements (split on single spaces).

    Args:
        buffer: list of text fragments collected so far; may be empty.
        new_value: output list of pandoc elements; mutated in place.
        item: optional non-text element that followed the buffered text.
            When given, it is placed right after the translated tokens.

    Exits the process with status 1 (via ``sys.exit``) when the translator
    raises ``json.decoder.JSONDecodeError``. A ``TypeError`` from the
    translator is treated as "keep the original text".
    """
    if not buffer:
        # Nothing to translate: just pass the pending element through.
        if item:
            new_value.append(item)
        return

    text = ''.join(buffer)

    try:
        translated_text = translate.translate(text)
    except TypeError:
        translated_text = text
    except json.decoder.JSONDecodeError as e:
        print('Failed to translate', str(e), file=sys.stderr)
        sys.exit(1)

    debug('Translate', text, ' -> ', translated_text)

    # Keep the capitalization of the first letter. The translation may come
    # back empty, in which case there is no first character to inspect.
    if (
        text
        and translated_text
        and text[0].isupper()
        and not translated_text[0].isupper()
    ):
        translated_text = translated_text[0].upper() + translated_text[1:]

    # Keep leading/trailing spaces so neighbouring elements stay separated.
    if text.startswith(' ') and not translated_text.startswith(' '):
        translated_text = ' ' + translated_text

    if text.endswith(' ') and not translated_text.endswith(' '):
        translated_text = translated_text + ' '

    for token in translated_text.split(' '):
        new_value.append(pandocfilters.Str(token))
        new_value.append(pandocfilters.Space())

    # The loop above always leaves one surplus trailing Space: drop it, or
    # reuse its slot for the element that followed the text.
    if item is None:
        new_value.pop()
    else:
        new_value[-1] = item
def process_sentence(value):
    """Return a translated copy of the inline element list *value*.

    Consecutive ``Str``, ``Space`` and ``DoubleQuote`` elements are gathered
    into one text run; every other dict element ends the run, which is then
    handed to ``process_buffer`` together with that element. Nested lists are
    processed recursively, and anything that is neither a list nor a dict is
    copied through unchanged.
    """
    result = []
    pending = []
    for element in value:
        if isinstance(element, list):
            result.append([process_sentence(child) for child in element])
        elif not isinstance(element, dict):
            result.append(element)
        else:
            kind = element.get('t')
            if kind == 'Str':
                pending.append(element.get('c'))
            elif kind == 'Space':
                pending.append(' ')
            elif kind == 'DoubleQuote':
                pending.append('"')
            else:
                # A non-text element closes the current run of text.
                process_buffer(pending, result, element)
                pending = []
    # Flush whatever text is left after the last element.
    process_buffer(pending, result)
    return result
def translate_filter(key, value, _format, _):
    """Pandoc filter action that translates the text of selected elements.

    Args:
        key: element type name (e.g. ``'Para'``, ``'Link'``, ``'Header'``).
        value: the element's content; may be mutated in place.
        _format: unused.
        _: unused.

    Returns:
        A replacement element for ``Para``, ``Plain``, ``Strong``, ``Emph``,
        ``Link``, ``Header`` and ``SoftBreak``; ``None`` for everything else
        (including types pandocfilters has no constructor for).

    A ``Para`` whose first element starts with ``!!!`` is handled as an
    admonition: everything up to the first ``SoftBreak`` is the header line,
    whose title is translated and wrapped in double quotes, followed by a
    ``LineBreak``, a four-space ``Str`` and the translated remainder.
    """
    if key not in ['Space', 'Str']:
        debug(key, value)
    try:
        cls = getattr(pandocfilters, key)
    except AttributeError:
        # pandocfilters has no constructor for this element type.
        return

    if key == 'Para' and value:
        marker = value[0].get('c')
        if isinstance(marker, str) and marker.startswith('!!!') and len(value) > 2:
            # Admonition case
            if marker != '!!!':
                # Lost space after !!! case: split "!!!word" back into
                # "!!!", Space, "word".
                value.insert(1, pandocfilters.Str(marker[3:]))
                value.insert(1, pandocfilters.Space())
                value[0]['c'] = '!!!'
            admonition_value = []
            remaining_para_value = []
            in_admonition = True
            for item in value:
                if in_admonition:
                    # The first SoftBreak ends the header line and is dropped.
                    if item.get('t') == 'SoftBreak':
                        in_admonition = False
                    else:
                        admonition_value.append(item)
                else:
                    remaining_para_value.append(item)

            break_value = [pandocfilters.LineBreak(), pandocfilters.Str(' ' * 4)]
            if admonition_value[-1].get('t') == 'Quoted':
                # Existing quoted title: translate its inner elements and
                # re-wrap them in literal double quotes.
                text = process_sentence(admonition_value[-1]['c'][-1])
                text[0]['c'] = '"' + text[0]['c']
                text[-1]['c'] = text[-1]['c'] + '"'
                admonition_value.pop(-1)
                admonition_value += text
            else:
                debug('>>>')
                # No explicit title: derive one from the last header element
                # and append it after that element as a quoted string.
                text = admonition_value[-1].get('c')
                if text:
                    # `translate` is the imported module; the callable is
                    # translate.translate (calling the module raised TypeError).
                    text = translate.translate(text[0].upper() + text[1:])
                    admonition_value.append(pandocfilters.Space())
                    admonition_value.append(pandocfilters.Str(f'"{text}"'))

            return cls(admonition_value + break_value + process_sentence(remaining_para_value))
        else:
            return cls(process_sentence(value))
    elif key == 'Plain' or key == 'Strong' or key == 'Emph':
        return cls(process_sentence(value))
    elif key == 'Link':
        try:
            # Plain links case: the link text equals value[2][0] (presumably
            # the link target), so emit it as plain text without translating.
            if value[2][0] == value[1][0].get('c'):
                return pandocfilters.Str(value[2][0])
        except IndexError:
            pass
        value[1] = process_sentence(value[1])
        return cls(*value)
    elif key == 'Header':
        # value[1][0] (presumably the header identifier) is normalized into
        # a '-'-separated slug before the header text is translated.
        value[1][0] = slugify.slugify(value[1][0], separator='-', word_boundary=True, save_order=True)
        # TODO: title case header in en
        value[2] = process_sentence(value[2])
        return cls(*value)
    elif key == 'SoftBreak':
        return pandocfilters.LineBreak()

    return
# Script entry point: when executed directly (not imported), hand
# translate_filter to pandocfilters.toJSONFilter as the filter action.
if __name__ == "__main__":
    pandocfilters.toJSONFilter(translate_filter)