import concurrent.futures
import hashlib
import json
import logging
import os
import shutil
import subprocess
import sys

import bs4
import closure
import cssmin
import htmlmin
import jsmin

import util


# Allow iframes only from whitelisted domains and wrap them in a Bootstrap
# responsive-embed container.
def handle_iframe(iframe, soup):
    allowed_domains = ['https://www.youtube.com/', 'https://datalens.yandex/']
    illegal_domain = True
    iframe_src = iframe.attrs['src']
    for domain in allowed_domains:
        if iframe_src.startswith(domain):
            illegal_domain = False
            break
    if illegal_domain:
        raise RuntimeError(f'iframe from illegal domain: {iframe_src}')
    wrapper = soup.new_tag('div')
    wrapper.attrs['class'] = ['embed-responsive', 'embed-responsive-16by9']
    iframe.insert_before(wrapper)
    iframe.extract()
    wrapper.insert(0, iframe)
    if 'width' in iframe.attrs:
        del iframe.attrs['width']
    if 'height' in iframe.attrs:
        del iframe.attrs['height']
    iframe.attrs['allow'] = 'accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture'
    iframe.attrs['class'] = 'embed-responsive-item'
    iframe.attrs['frameborder'] = '0'
    iframe.attrs['allowfullscreen'] = '1'


# Post-process HTML rendered from Markdown: open external links in new tabs,
# tag code blocks for syntax highlighting, make embeds and images responsive,
# and style admonitions as Bootstrap alerts.
def adjust_markdown_html(content):
    soup = bs4.BeautifulSoup(
        content,
        features='html.parser'
    )

    for a in soup.find_all('a'):
        a_class = a.attrs.get('class')
        a_href = a.attrs.get('href')
        if a_class and 'headerlink' in a_class:
            a.string = '\xa0'
        if a_href and a_href.startswith('http'):
            a.attrs['target'] = '_blank'

    for code in soup.find_all('code'):
        code_class = code.attrs.get('class')
        if code_class:
            code.attrs['class'] = code_class + ['syntax']
        else:
            code.attrs['class'] = 'syntax'

    for iframe in soup.find_all('iframe'):
        handle_iframe(iframe, soup)

    for img in soup.find_all('img'):
        if img.attrs.get('alt') == 'iframe':
            img.name = 'iframe'
            img.string = ''
            handle_iframe(img, soup)
            continue
        img_class = img.attrs.get('class')
        if img_class:
            img.attrs['class'] = img_class + ['img-fluid']
        else:
            img.attrs['class'] = 'img-fluid'

    for details in soup.find_all('details'):
        for summary in details.find_all('summary'):
            if summary.parent != details:
                summary.extract()
                details.insert(0, summary)

    for dd in soup.find_all('dd'):
        dd_class = dd.attrs.get('class')
        if dd_class:
            dd.attrs['class'] = dd_class + ['pl-3']
        else:
            dd.attrs['class'] = 'pl-3'

    for div in soup.find_all('div'):
        div_class = div.attrs.get('class')
        is_admonition = div_class and 'admonition' in div_class
        if is_admonition:
            for a in div.find_all('a'):
                a_class = a.attrs.get('class')
                if a_class:
                    a.attrs['class'] = a_class + ['alert-link']
                else:
                    a.attrs['class'] = 'alert-link'

        for p in div.find_all('p'):
            p_class = p.attrs.get('class')
            if is_admonition and p_class and ('admonition-title' in p_class):
                p.attrs['class'] = p_class + ['alert-heading', 'display-4', 'text-reset', 'mb-2']

        if is_admonition:
            div.attrs['role'] = 'alert'
            if ('info' in div_class) or ('note' in div_class):
                mode = 'alert-primary'
            elif ('attention' in div_class) or ('warning' in div_class):
                mode = 'alert-warning'
            elif 'important' in div_class:
                mode = 'alert-danger'
            elif 'tip' in div_class:
                mode = 'alert-info'
            else:
                mode = 'alert-secondary'
            div.attrs['class'] = div_class + ['alert', 'pb-0', 'mb-4', mode]

    return str(soup)


def minify_html(content):
    return htmlmin.minify(content)


def build_website(args):
    logging.info('Building website')
    env = util.init_jinja2_env(args)

    shutil.copytree(
        args.website_dir,
        args.output_dir,
        ignore=shutil.ignore_patterns(
            '*.md',
            '*.sh',
            '*.css',
            '*.json',
            'js/*.js',
            'build',
            'docs',
            'public',
            'node_modules',
            'src',
            'templates',
            'locale',
            '.gitkeep'
        )
    )

    # This file can be requested to check for available ClickHouse releases.
    shutil.copy2(
        os.path.join(args.src_dir, 'utils', 'list-versions', 'version_date.tsv'),
        os.path.join(args.output_dir, 'data', 'version_date.tsv'))

    # This file can be requested to install ClickHouse.
    shutil.copy2(
        os.path.join(args.src_dir, 'docs', '_includes', 'install', 'universal.sh'),
        os.path.join(args.output_dir, 'data', 'install.sh'))

    # Render every HTML file in the output tree as a Jinja2 template.
    for root, _, filenames in os.walk(args.output_dir):
        for filename in filenames:
            if filename == 'main.html':
                continue

            path = os.path.join(root, filename)
            if not filename.endswith('.html'):
                continue
            logging.info('Processing %s', path)
            with open(path, 'rb') as f:
                content = f.read().decode('utf-8')

            template = env.from_string(content)
            content = template.render(args.__dict__)

            with open(path, 'wb') as f:
                f.write(content.encode('utf-8'))


def get_css_in(args):
    return [
        f"'{args.website_dir}/css/bootstrap.css'",
        f"'{args.website_dir}/css/docsearch.css'",
        f"'{args.website_dir}/css/base.css'",
        f"'{args.website_dir}/css/blog.css'",
        f"'{args.website_dir}/css/docs.css'",
        f"'{args.website_dir}/css/highlight.css'",
        f"'{args.website_dir}/css/main.css'"
    ]


def get_js_in(args):
    return [
        f"'{args.website_dir}/js/jquery.js'",
        f"'{args.website_dir}/js/popper.js'",
        f"'{args.website_dir}/js/bootstrap.js'",
        f"'{args.website_dir}/js/sentry.js'",
        f"'{args.website_dir}/js/base.js'",
        f"'{args.website_dir}/js/index.js'",
        f"'{args.website_dir}/js/docsearch.js'",
        f"'{args.website_dir}/js/docs.js'",
        f"'{args.website_dir}/js/main.js'"
    ]


def minify_file(path, css_digest, js_digest):
    if not (
        path.endswith('.html') or
        path.endswith('.css')
    ):
        return

    logging.info('Minifying %s', path)
    with open(path, 'rb') as f:
        content = f.read().decode('utf-8')
    if path.endswith('.html'):
        content = minify_html(content)
        # Substitute the bundle digests into asset URLs for cache busting.
        content = content.replace('base.css?css_digest', f'base.css?{css_digest}')
        content = content.replace('base.js?js_digest', f'base.js?{js_digest}')
    # TODO: restore cssmin
    # elif path.endswith('.css'):
    #     content = cssmin.cssmin(content)
    # TODO: restore jsmin
    # elif path.endswith('.js'):
    #     content = jsmin.jsmin(content)
    with open(path, 'wb') as f:
        f.write(content.encode('utf-8'))


def minify_website(args):
    # Output greenhouse css separately from the main bundle to be included via the greenhouse iframe
    command = f"cat '{args.website_dir}/css/greenhouse.css' > '{args.output_dir}/css/greenhouse.css'"
    logging.info(command)
    output = subprocess.check_output(command, shell=True)
    logging.debug(output)

    css_in = ' '.join(get_css_in(args))
    css_out = f'{args.output_dir}/css/base.css'
    if args.minify:
        command = f"purifycss -w '*algolia*' --min {css_in} '{args.output_dir}/*.html' " \
            f"'{args.output_dir}/docs/en/**/*.html' '{args.website_dir}/js/**/*.js' > {css_out}"
    else:
        command = f'cat {css_in} > {css_out}'

    logging.info(command)
    output = subprocess.check_output(command, shell=True)
    logging.debug(output)

    with open(css_out, 'rb') as f:
        css_digest = hashlib.sha3_224(f.read()).hexdigest()[0:8]

    js_in = get_js_in(args)
    js_out = f'{args.output_dir}/js/base.js'
    if args.minify and False:  # TODO: return closure
        js_in = [js[1:-1] for js in js_in]
        closure_args = [
            '--js', *js_in, '--js_output_file', js_out,
            '--compilation_level', 'SIMPLE',
            '--dependency_mode', 'NONE',
            '--third_party', '--use_types_for_optimization',
            '--isolation_mode', 'IIFE'
        ]
        logging.info(closure_args)
        if closure.run(*closure_args):
            raise RuntimeError('failed to run closure compiler')

        with open(js_out, 'r') as f:
            js_content = jsmin.jsmin(f.read())
        with open(js_out, 'w') as f:
            f.write(js_content)
    else:
        js_in = ' '.join(js_in)
        command = f'cat {js_in} > {js_out}'
        logging.info(command)
        output = subprocess.check_output(command, shell=True)
        logging.debug(output)

    with open(js_out, 'rb') as f:
        js_digest = hashlib.sha3_224(f.read()).hexdigest()[0:8]
    logging.info(js_digest)

    if args.minify:
        logging.info('Minifying website')
        with concurrent.futures.ThreadPoolExecutor() as executor:
            futures = []
            for root, _, filenames in os.walk(args.output_dir):
                for filename in filenames:
                    path = os.path.join(root, filename)
                    futures.append(executor.submit(minify_file, path, css_digest, js_digest))
            for future in futures:
                exc = future.exception()
                if exc:
                    logging.error(exc)
                    sys.exit(1)


# Merge the per-submission benchmark JSON files into a single results.js
# per benchmark kind, validating required keys along the way.
def process_benchmark_results(args):
    benchmark_root = os.path.join(args.website_dir, 'benchmark')
    required_keys = {
        'dbms': ['result'],
        'hardware': ['result', 'system', 'system_full', 'kind']
    }
    for benchmark_kind in ['dbms', 'hardware']:
        results = []
        results_root = os.path.join(benchmark_root, benchmark_kind, 'results')
        for result in sorted(os.listdir(results_root)):
            result_file = os.path.join(results_root, result)
            logging.debug(f'Reading benchmark result from {result_file}')
            with open(result_file, 'r') as f:
                result = json.loads(f.read())
                for item in result:
                    for required_key in required_keys[benchmark_kind]:
                        assert required_key in item, f'No "{required_key}" in {result_file}'
            results += result
        results_js = os.path.join(args.output_dir, 'benchmark', benchmark_kind, 'results.js')
        with open(results_js, 'w') as f:
            data = json.dumps(results)
            f.write(f'var results = {data};')
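

# The build is normally driven by an external entry point that constructs
# `args` (and `util.init_jinja2_env(args)` may expect attributes beyond those
# read in this module). What follows is a minimal, hypothetical sketch of such
# a driver, assuming only the attributes this module itself uses
# (`src_dir`, `website_dir`, `output_dir`, `minify`); the flag names and
# defaults are illustrative, not taken from the real entry point.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='build the website')
    parser.add_argument('--src-dir', default='..')  # hypothetical default
    parser.add_argument('--website-dir', default='website')  # hypothetical default
    parser.add_argument('--output-dir', default='build')  # hypothetical default
    parser.add_argument('--minify', action='store_true')
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, stream=sys.stderr)

    # Copy and render the site first, then emit benchmark data, and minify
    # last so that the generated HTML is included in the minification pass.
    build_website(args)
    process_benchmark_results(args)
    minify_website(args)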