ClickHouse/docs/tools/single_page.py

231 lines
8.8 KiB
Python
Raw Normal View History

import logging
import os
import re
import shutil
import subprocess
import yaml
import bs4
import mkdocs.commands.build
import test
import util
import website
def recursive_values(item):
    """Yield every string leaf of a nested dict/list structure, depth-first.

    Dict values are visited in the dict's iteration order and list elements in
    index order. Anything that is not a dict, a list or a str is ignored.
    """
    if isinstance(item, str):
        yield item
        return

    if isinstance(item, dict):
        # Walk a snapshot of the values rather than the live dict view.
        children = list(item.values())
    elif isinstance(item, list):
        children = item
    else:
        return

    for child in children:
        yield from recursive_values(child)
2021-03-14 13:31:16 +00:00
# Matches every character that is neither a word character nor a hyphen.
anchor_not_allowed_chars = re.compile(r'[^\w\-]')


def generate_anchor_from_path(path):
    """Return `path` with each character matched by the pattern above replaced by '-'."""
    return anchor_not_allowed_chars.sub('-', path)
2021-03-15 11:56:04 +00:00
# Matches targets that start with an http:// or https:// scheme.
absolute_link = re.compile(r'^https?://')


def replace_link(match, path):
    """Rewrite one matched Markdown link so that it targets an in-page anchor.

    `match` must expose the link text (brackets included) as group 1 and the
    link target as group 2; `path` is the path of the file the link was found
    in. Targets starting with http:// or https:// are returned exactly as they
    were matched. Every other target is resolved against the directory of
    `path`, normalised, and turned into an anchor name with
    generate_anchor_from_path().
    """
    link_text, target = match.group(1), match.group(2)

    # Not a relative link: leave the whole match as it is.
    if absolute_link.search(target):
        return match.group(0)

    # A target ending in '/' has that slash replaced by '.md'.
    if target.endswith('/'):
        target = target[:-1] + '.md'

    resolved = os.path.normpath(os.path.join(os.path.dirname(path), target))
    return f'{link_text}(#{generate_anchor_from_path(resolved)})'
2021-03-14 13:31:16 +00:00
def concatenate(lang, docs_path, single_page_file, nav):
    """Concatenate the Markdown files of one language into a single file.

    Each source file is preceded by a horizontal ruler and an anchor derived
    from its path. Its leading YAML front matter is dropped, every line that
    starts with '#' gets one more '#', and Markdown links are rewritten with
    replace_link() so that they point at those anchors.

    Args:
        lang: Name of the language sub-directory of `docs_path`.
        docs_path: Root directory of the documentation sources.
        single_page_file: Writable text file object that receives the result;
            it is flushed before returning.
        nav: Nested dicts/lists whose string leaves are source paths relative
            to the language directory. Ignored when
            `<docs_path>/toc_<lang>.yml` exists, in which case the 'nav' entry
            of that file is used instead.

    Raises:
        AssertionError: if there is no file to concatenate.

    Files that cannot be opened or read are logged as warnings and skipped.
    """
    lang_path = os.path.join(docs_path, lang)

    proj_config = f'{docs_path}/toc_{lang}.yml'
    if os.path.exists(proj_config):
        with open(proj_config) as cfg_file:
            # NOTE(review): yaml.full_load is only appropriate for trusted
            # files; consider yaml.safe_load if the toc can ever come from an
            # untrusted source.
            nav = yaml.full_load(cfg_file.read())['nav']

    files_to_concatenate = list(recursive_values(nav))
    files_count = len(files_to_concatenate)
    logging.info(f'{files_count} files will be concatenated into single md-file for {lang}.')
    logging.debug('Concatenating: ' + ', '.join(files_to_concatenate))
    assert files_count > 0, f'Empty single-page for {lang}'

    link_regexp = re.compile(r'(\[[^\]]+\])\(([^)#]+)(?:#[^\)]+)?\)')

    for path in files_to_concatenate:
        try:
            with open(os.path.join(lang_path, path)) as f:
                # Insert a horizontal ruler. Then insert an anchor that we will
                # link to. Its name will be a path to the .md file.
                single_page_file.write('\n______\n<a name="%s"></a>\n' % generate_anchor_from_path(path))

                in_metadata = False
                for line_number, line in enumerate(f):
                    is_delimiter = line.rstrip() == '---'

                    # Skip YAML metadata. It is only recognised when it opens
                    # on the very first line of the file, so that a '---'
                    # horizontal rule further down does not hide the text that
                    # follows it.
                    if line_number == 0 and is_delimiter:
                        in_metadata = True
                        continue
                    if in_metadata:
                        if is_delimiter:
                            in_metadata = False
                        continue

                    # Increase the level of headers.
                    # NOTE: lines starting with '#' inside fenced code blocks
                    # are affected as well.
                    if line.startswith('#'):
                        line = '#' + line

                    # Replace links within the docs (a no-op when the line
                    # contains no link).
                    line = link_regexp.sub(lambda match: replace_link(match, path), line)

                    # If failed to replace the relative link, print to log
                    if '../' in line:
                        logging.info('Failed to resolve relative link:')
                        logging.info(path)
                        logging.info(line)

                    single_page_file.write(line)
        except IOError as e:
            logging.warning(str(e))

    single_page_file.flush()
def build_single_page_version(lang, args, nav, cfg):
    """Build, test and optionally render to PDF the single-page docs of `lang`.

    Steps:
      1. Concatenate the Markdown sources into `<docs_dir>/<lang>/single.md`.
      2. Copy the language directory to a temporary location, delete every
         other `.md` file from the copy and, unless `args.test_only` is set,
         build it with MkDocs and publish the result to
         `<docs_dir>/<docs_output_dir>/<lang>/single`.
      3. Build again with `extra['single_page']` switched off, append the
         website CSS/JS bundles and run `test.test_single_page` on the page.
      4. Unless `args.skip_pdf` is set, render that page with wkhtmltopdf
         (best effort: a failure is logged, not raised).

    Side effects: sets the SINGLE_PAGE environment variable to '1', mutates
    `cfg.data['extra']` and reloads `cfg`. The temporary `single.md` is
    removed on the way out even when a step fails.

    Args:
        lang: Language directory name under `args.docs_dir`.
        args: Options object; the attributes read are docs_dir,
            docs_output_dir, test_only, minify, save_raw_single_page and
            skip_pdf (plus whatever website.get_css_in/get_js_in read).
        nav: Navigation structure forwarded to concatenate().
        cfg: MkDocs configuration object.
    """
    logging.info(f'Building single page version for {lang}')
    os.environ['SINGLE_PAGE'] = '1'
    extra = cfg.data['extra']
    extra['single_page'] = True
    extra['is_amp'] = False

    single_md_path = os.path.join(args.docs_dir, lang, 'single.md')
    # Computed up front: the PDF step needs it even when the site build is
    # skipped because of args.test_only.
    single_page_output_path = os.path.join(args.docs_dir, args.docs_output_dir, lang, 'single')

    try:
        # concatenate() flushes the file; closing it here as well guarantees
        # the content is on disk before the directory is copied below.
        with open(single_md_path, 'w') as single_md:
            concatenate(lang, args.docs_dir, single_md, nav)

        with util.temp_dir() as site_temp, util.temp_dir() as docs_temp:
            docs_src_lang = os.path.join(args.docs_dir, lang)
            docs_temp_lang = os.path.join(docs_temp, lang)
            shutil.copytree(docs_src_lang, docs_temp_lang)

            # Keep single.md as the only Markdown page of the copy.
            for root, _, filenames in os.walk(docs_temp_lang):
                for filename in filenames:
                    if filename != 'single.md' and filename.endswith('.md'):
                        os.unlink(os.path.join(root, filename))

            cfg.load_dict({
                'docs_dir': docs_temp_lang,
                'site_dir': site_temp,
                'extra': extra,
                'nav': [
                    {cfg.data.get('site_name'): 'single.md'}
                ]
            })

            if not args.test_only:
                mkdocs.commands.build.build(cfg)
                _publish_single_page(site_temp, single_page_output_path, args.minify)

            logging.info(f'Re-building single page for {lang} pdf/test')
            with util.temp_dir() as test_dir:
                extra['single_page'] = False
                cfg.load_dict({
                    'docs_dir': docs_temp_lang,
                    'site_dir': test_dir,
                    'extra': extra,
                    'nav': [
                        {cfg.data.get('site_name'): 'single.md'}
                    ]
                })
                mkdocs.commands.build.build(cfg)

                # The entries returned by website.get_css_in/get_js_in are
                # interpolated into a shell command unchanged.
                css_in = ' '.join(website.get_css_in(args))
                js_in = ' '.join(website.get_js_in(args))
                subprocess.check_call(f'cat {css_in} > {test_dir}/css/base.css', shell=True)
                subprocess.check_call(f'cat {js_in} > {test_dir}/js/base.js', shell=True)

                if args.save_raw_single_page:
                    shutil.copytree(test_dir, args.save_raw_single_page)

                logging.info(f'Running tests for {lang}')
                test.test_single_page(
                    os.path.join(test_dir, 'single', 'index.html'), lang)

                if not args.skip_pdf:
                    single_page_pdf = os.path.abspath(
                        os.path.join(single_page_output_path, f'clickhouse_{lang}.pdf')
                    )
                    _render_single_page_pdf(test_dir, single_page_pdf)

        logging.info(f'Finished building single page version for {lang}')
    finally:
        # Never leave the generated single.md behind in the source tree.
        if os.path.exists(single_md_path):
            os.unlink(single_md_path)


def _publish_single_page(site_dir, output_path, minify):
    """Copy `<site_dir>/single` to `output_path` and split index.html in two.

    index.html must contain exactly two '<!-- BREAK -->' markers: the text
    between them is written to content.js (passed through jsmin when `minify`
    is true) and index.html keeps only the text before and after them.
    """
    if os.path.exists(output_path):
        shutil.rmtree(output_path)
    shutil.copytree(os.path.join(site_dir, 'single'), output_path)

    index_html = os.path.join(output_path, 'index.html')
    content_js = os.path.join(output_path, 'content.js')

    with open(index_html, 'r') as f:
        sp_prefix, sp_js, sp_suffix = f.read().split('<!-- BREAK -->')

    with open(index_html, 'w') as f:
        f.write(sp_prefix)
        f.write(sp_suffix)

    with open(content_js, 'w') as f:
        if minify:
            import jsmin
            sp_js = jsmin.jsmin(sp_js)
        f.write(sp_js)


def _render_single_page_pdf(test_dir, pdf_path):
    """Rewrite asset URLs of the test build to file:// paths and run wkhtmltopdf."""
    index_html = os.path.join(test_dir, 'single', 'index.html')

    with open(index_html, 'r') as f:
        soup = bs4.BeautifulSoup(
            f.read(),
            features='html.parser'
        )

    soup_prefix = f'file://{test_dir}'
    for img in soup.find_all('img'):
        src = img.get('src')
        if src is not None and src.startswith('/'):
            img['src'] = soup_prefix + src
    for script in soup.find_all('script'):
        script_src = script.get('src')
        if script_src:
            # Drop the query string: it is meaningless for a local file.
            script['src'] = soup_prefix + script_src.split('?', 1)[0]
    for link in soup.find_all('link'):
        href = link.get('href')
        if href is not None:
            link['href'] = soup_prefix + href.split('?', 1)[0]

    with open(index_html, 'w') as f:
        f.write(str(soup))

    create_pdf_command = [
        'wkhtmltopdf',
        '--print-media-type',
        '--log-level', 'warn',
        index_html, pdf_path
    ]

    logging.info(' '.join(create_pdf_command))
    try:
        # Argument list, no shell: paths with spaces or shell metacharacters
        # reach wkhtmltopdf intact.
        subprocess.check_call(create_pdf_command)
    except Exception:
        # TODO: fix pdf issues
        # PDF generation stays best-effort, but the failure is made visible.
        logging.warning('Failed to create %s', pdf_path, exc_info=True)