Merge pull request #49701 from ClickHouse/fix-browser

Improve woboq codebrowser pipeline
commit b1fd1d3ae6 by Mikhail f. Shiryaev, 2023-05-12 19:45:17 +02:00 (committed by GitHub)
7 changed files with 154 additions and 60 deletions

View File

@@ -72,6 +72,9 @@ jobs:
         with:
           name: changed_images
           path: ${{ runner.temp }}/changed_images.json
+  Codebrowser:
+    needs: [DockerHubPush]
+    uses: ./.github/workflows/woboq.yml
   BuilderCoverity:
     needs: DockerHubPush
     runs-on: [self-hosted, builder]

View File

@@ -6,9 +6,8 @@ env:
 concurrency:
   group: woboq
 on: # yamllint disable-line rule:truthy
-  schedule:
-    - cron: '0 */18 * * *'
   workflow_dispatch:
+  workflow_call:
 jobs:
   # don't use dockerhub push because this image updates so rarely
   WoboqCodebrowser:
@@ -26,6 +25,10 @@ jobs:
         with:
           clear-repository: true
           submodules: 'true'
+      - name: Download json reports
+        uses: actions/download-artifact@v3
+        with:
+          path: ${{ env.IMAGES_PATH }}
       - name: Codebrowser
         run: |
           sudo rm -fr "$TEMP_PATH"

View File

@@ -20,26 +20,11 @@ RUN arch=${TARGETARCH:-amd64} \
 # repo versions doesn't work correctly with C++17
 # also we push reports to s3, so we add index.html to subfolder urls
-# https://github.com/ClickHouse-Extras/woboq_codebrowser/commit/37e15eaf377b920acb0b48dbe82471be9203f76b
-RUN git clone --depth=1 https://github.com/ClickHouse/woboq_codebrowser /woboq_codebrowser \
+# https://github.com/ClickHouse/woboq_codebrowser/commit/37e15eaf377b920acb0b48dbe82471be9203f76b
+RUN git clone --branch=master --depth=1 https://github.com/ClickHouse/woboq_codebrowser /woboq_codebrowser \
     && cd /woboq_codebrowser \
     && cmake . -G Ninja -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_COMPILER=clang\+\+-${LLVM_VERSION} -DCMAKE_C_COMPILER=clang-${LLVM_VERSION} -DCLANG_BUILTIN_HEADERS_DIR=/usr/lib/llvm-${LLVM_VERSION}/lib/clang/${LLVM_VERSION}/include \
     && ninja
-ENV CODEGEN=/woboq_codebrowser/generator/codebrowser_generator
-ENV CODEINDEX=/woboq_codebrowser/indexgenerator/codebrowser_indexgenerator
-ENV STATIC_DATA=/woboq_codebrowser/data
-ENV SOURCE_DIRECTORY=/repo_folder
-ENV BUILD_DIRECTORY=/build
-ENV HTML_RESULT_DIRECTORY=$BUILD_DIRECTORY/html_report
-ENV SHA=nosha
-ENV DATA="https://s3.amazonaws.com/clickhouse-test-reports/codebrowser/data"
-CMD mkdir -p $BUILD_DIRECTORY && cd $BUILD_DIRECTORY && \
-    cmake $SOURCE_DIRECTORY -DCMAKE_CXX_COMPILER=/usr/bin/clang\+\+-${LLVM_VERSION} -DCMAKE_C_COMPILER=/usr/bin/clang-${LLVM_VERSION} -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DENABLE_EMBEDDED_COMPILER=0 -DENABLE_S3=0 && \
-    mkdir -p $HTML_RESULT_DIRECTORY && \
-    $CODEGEN -b $BUILD_DIRECTORY -a -o $HTML_RESULT_DIRECTORY -p ClickHouse:$SOURCE_DIRECTORY:$SHA -d $DATA | ts '%Y-%m-%d %H:%M:%S' && \
-    cp -r $STATIC_DATA $HTML_RESULT_DIRECTORY/ && \
-    $CODEINDEX $HTML_RESULT_DIRECTORY -d "$DATA" | ts '%Y-%m-%d %H:%M:%S' && \
-    mv $HTML_RESULT_DIRECTORY /test_output
+COPY build.sh /
+
+CMD ["bash", "-c", "/build.sh 2>&1"]

View File

@@ -0,0 +1,29 @@
+#!/usr/bin/env bash
+
+set -x -e
+
+STATIC_DATA=${STATIC_DATA:-/woboq_codebrowser/data}
+SOURCE_DIRECTORY=${SOURCE_DIRECTORY:-/build}
+BUILD_DIRECTORY=${BUILD_DIRECTORY:-/workdir/build}
+OUTPUT_DIRECTORY=${OUTPUT_DIRECTORY:-/workdir/output}
+HTML_RESULT_DIRECTORY=${HTML_RESULT_DIRECTORY:-$OUTPUT_DIRECTORY/html_report}
+SHA=${SHA:-nosha}
+DATA=${DATA:-https://s3.amazonaws.com/clickhouse-test-reports/codebrowser/data}
+nproc=$(($(nproc) + 2)) # increase parallelism
+
+read -ra CMAKE_FLAGS <<< "${CMAKE_FLAGS:-}"
+
+mkdir -p "$BUILD_DIRECTORY" && cd "$BUILD_DIRECTORY"
+cmake "$SOURCE_DIRECTORY" -DCMAKE_CXX_COMPILER="/usr/bin/clang++-${LLVM_VERSION}" -DCMAKE_C_COMPILER="/usr/bin/clang-${LLVM_VERSION}" -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DENABLE_EMBEDDED_COMPILER=0 "${CMAKE_FLAGS[@]}"
+mkdir -p "$HTML_RESULT_DIRECTORY"
+echo 'Filter out too noisy "Error: filename" lines and keep them in full codebrowser_generator.log'
+/woboq_codebrowser/generator/codebrowser_generator -b "$BUILD_DIRECTORY" -a \
+    -o "$HTML_RESULT_DIRECTORY" --execute-concurrency="$nproc" -p "ClickHouse:$SOURCE_DIRECTORY:$SHA" \
+    -d "$DATA" \
+    |& ts '%Y-%m-%d %H:%M:%S' \
+    | tee "$OUTPUT_DIRECTORY/codebrowser_generator.log" \
+    | grep --line-buffered -v ':[0-9]* Error: '
+
+cp -r "$STATIC_DATA" "$HTML_RESULT_DIRECTORY/"
+/woboq_codebrowser/indexgenerator/codebrowser_indexgenerator "$HTML_RESULT_DIRECTORY" \
+    -d "$DATA" |& ts '%Y-%m-%d %H:%M:%S'
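The generator stage in this new build.sh keeps the complete output in codebrowser_generator.log while hiding the per-file ":NNN Error:" noise from the console. For illustration only, a rough Python equivalent of that tee-plus-filter pipeline; the file name and pattern are copied from the script, but this helper itself is not part of the PR:

    #!/usr/bin/env python3
    """Hypothetical stand-in for `tee ... | grep -v ':[0-9]* Error: '` above."""
    import re
    import sys

    NOISY = re.compile(r":\d+ Error: ")

    with open("codebrowser_generator.log", "w", encoding="utf-8") as full_log:
        for line in sys.stdin:
            full_log.write(line)        # keep every line in the full log
            if not NOISY.search(line):
                sys.stdout.write(line)  # only the quiet lines reach the console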

View File

@@ -1,18 +1,19 @@
 #!/usr/bin/env python3

-import os
-import subprocess
 import logging
+import os
+from pathlib import Path

 from github import Github

 from commit_status_helper import get_commit, post_commit_status
-from docker_pull_helper import get_image_with_version
+from docker_pull_helper import get_image_with_version, DockerImage
 from env_helper import (
     IMAGES_PATH,
     REPO_COPY,
     S3_DOWNLOAD,
+    S3_BUILDS_BUCKET,
     S3_TEST_REPORTS_BUCKET,
     TEMP_PATH,
 )
@@ -27,16 +28,24 @@ from upload_result_helper import upload_results

 NAME = "Woboq Build"

-def get_run_command(repo_path, output_path, image):
+def get_run_command(
+    repo_path: Path, output_path: Path, image: DockerImage, sha: str
+) -> str:
+    user = f"{os.geteuid()}:{os.getegid()}"
     cmd = (
-        "docker run " + f"--volume={repo_path}:/repo_folder "
-        f"--volume={output_path}:/test_output "
-        f"-e 'DATA={S3_DOWNLOAD}/{S3_TEST_REPORTS_BUCKET}/codebrowser/data' {image}"
+        f"docker run --rm --user={user} --volume={repo_path}:/build "
+        f"--volume={output_path}:/workdir/output --network=host "
+        # use sccache, https://github.com/KDAB/codebrowser/issues/111
+        f"-e SCCACHE_BUCKET='{S3_BUILDS_BUCKET}' "
+        "-e SCCACHE_S3_KEY_PREFIX=ccache/sccache "
+        '-e CMAKE_FLAGS="$CMAKE_FLAGS -DCOMPILER_CACHE=sccache" '
+        f"-e 'DATA={S3_DOWNLOAD}/{S3_TEST_REPORTS_BUCKET}/codebrowser/data' "
+        f"-e SHA={sha} {image}"
     )
     return cmd

-if __name__ == "__main__":
+def main():
     logging.basicConfig(level=logging.INFO)

     stopwatch = Stopwatch()
@@ -44,48 +53,83 @@ if __name__ == "__main__":
     gh = Github(get_best_robot_token(), per_page=100)
     pr_info = PRInfo()
     commit = get_commit(gh, pr_info.sha)

+    temp_path = Path(TEMP_PATH)
-    if not os.path.exists(TEMP_PATH):
-        os.makedirs(TEMP_PATH)
+    if not temp_path.exists():
+        os.makedirs(temp_path)

     docker_image = get_image_with_version(IMAGES_PATH, "clickhouse/codebrowser")
     s3_helper = S3Helper()

-    result_path = os.path.join(TEMP_PATH, "result_path")
-    if not os.path.exists(result_path):
+    result_path = temp_path / "result_path"
+    if not result_path.exists():
         os.makedirs(result_path)

-    run_command = get_run_command(REPO_COPY, result_path, docker_image)
+    run_command = get_run_command(
+        Path(REPO_COPY), result_path, docker_image, pr_info.sha[:12]
+    )
     logging.info("Going to run codebrowser: %s", run_command)

-    run_log_path = os.path.join(TEMP_PATH, "run.log")
+    run_log_path = result_path / "run.log"
+    state = "success"

     with TeePopen(run_command, run_log_path) as process:
         retcode = process.wait()
         if retcode == 0:
             logging.info("Run successfully")
         else:
             logging.info("Run failed")
+            state = "failure"

-    subprocess.check_call(f"sudo chown -R ubuntu:ubuntu {TEMP_PATH}", shell=True)

-    report_path = os.path.join(result_path, "html_report")
+    report_path = result_path / "html_report"
     logging.info("Report path %s", report_path)
     s3_path_prefix = "codebrowser"
-    html_urls = s3_helper.fast_parallel_upload_dir(
-        report_path, s3_path_prefix, "clickhouse-test-reports"
-    )
+    if state == "success":
+        _ = s3_helper.fast_parallel_upload_dir(
+            report_path, s3_path_prefix, S3_TEST_REPORTS_BUCKET
+        )

     index_html = (
-        '<a href="{S3_DOWNLOAD}/{S3_TEST_REPORTS_BUCKET}/codebrowser/index.html">'
-        "HTML report</a>"
+        f'<a href="{S3_DOWNLOAD}/{S3_TEST_REPORTS_BUCKET}/codebrowser/index.html">'
+        "Generate codebrowser site</a>"
     )

-    test_result = TestResult(index_html, "Look at the report")
+    additional_logs = [path.absolute() for path in result_path.glob("*.log")]

-    report_url = upload_results(s3_helper, 0, pr_info.sha, [test_result], [], NAME)
+    test_results = [
+        TestResult(index_html, state, stopwatch.duration_seconds, additional_logs)
+    ]
+    # Check if the run log contains `FATAL Error:`, that means the code problem
+    stopwatch = Stopwatch()
+    fatal_error = "FATAL Error:"
+    logging.info("Search for '%s' in %s", fatal_error, run_log_path)
+    with open(run_log_path, "r", encoding="utf-8") as rlfd:
+        for line in rlfd.readlines():
+            if "FATAL Error:" in line:
+                logging.warning(
+                    "The line '%s' found, mark the run as failure", fatal_error
+                )
+                state = "failure"
+                test_results.append(
+                    TestResult(
+                        "Indexing error",
+                        state,
+                        stopwatch.duration_seconds,
+                        additional_logs,
+                    )
+                )
+                break
+
+    report_url = upload_results(
+        s3_helper, pr_info.number, pr_info.sha, test_results, [], NAME
+    )

     print(f"::notice ::Report url: {report_url}")

-    post_commit_status(commit, "success", report_url, "Report built", NAME, pr_info)
+    post_commit_status(commit, state, report_url, "Report built", NAME, pr_info)
+
+
+if __name__ == "__main__":
+    main()
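The check script above scans run.log line by line for the indexer's "FATAL Error:" marker and flips the commit status to failure when it appears. The same scan, factored into a small standalone helper for clarity; the function name here is hypothetical and not part of this PR:

    from pathlib import Path


    def log_has_fatal_error(log_path: Path, marker: str = "FATAL Error:") -> bool:
        """Return True if any line of the run log contains the fatal marker."""
        with open(log_path, "r", encoding="utf-8") as log_file:
            return any(marker in line for line in log_file)


    # Hypothetical usage mirroring main():
    # if log_has_fatal_error(run_log_path):
    #     state = "failure"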

View File

@@ -1,3 +1,4 @@
+import logging
 import os
 from os import path as p
@@ -65,6 +66,32 @@ def GITHUB_JOB_ID() -> str:
         ):
             _GITHUB_JOB_ID = "0"

+    # FIXME: until it's here, we can't move to reusable workflows
+    if not _GITHUB_JOB_URL:
+        # This is a terrible workaround for the case of another broken part of
+        # GitHub actions. For nested workflows it doesn't provide a proper GITHUB_JOB
+        # value, but only the final one. So, for `OriginalJob / NestedJob / FinalJob`
+        # full name, GITHUB_JOB contains only FinalJob
+        matched_jobs = []
+        for job in jobs:
+            nested_parts = job["name"].split(" / ")
+            if len(nested_parts) <= 1:
+                continue
+            if nested_parts[-1] == GITHUB_JOB:
+                matched_jobs.append(job)
+        if len(matched_jobs) == 1:
+            # The best case scenario
+            _GITHUB_JOB_ID = matched_jobs[0]["id"]
+            _GITHUB_JOB_URL = matched_jobs[0]["html_url"]
+            return _GITHUB_JOB_ID
+        if matched_jobs:
+            logging.error(
+                "We could not get the ID and URL for the current job name %s, there "
+                "are more than one jobs match it for the nested workflows. Please, "
+                "refer to https://github.com/actions/runner/issues/2577",
+                GITHUB_JOB,
+            )
+
     return _GITHUB_JOB_ID
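The workaround above matches the bare GITHUB_JOB value against the last segment of each full nested-workflow job name. A minimal self-contained sketch of that matching rule; the job names, IDs, and URLs below are made-up examples, not data from this PR:

    # Standalone illustration of the `" / "` suffix matching above.
    GITHUB_JOB = "FinalJob"  # what GitHub reports for a nested workflow job
    jobs = [  # example payload; the real list comes from the Actions API
        {"name": "OriginalJob / NestedJob / FinalJob", "id": 42, "html_url": "https://example.invalid/42"},
        {"name": "UnrelatedJob", "id": 7, "html_url": "https://example.invalid/7"},
    ]

    matched_jobs = [
        job
        for job in jobs
        if len(job["name"].split(" / ")) > 1
        and job["name"].split(" / ")[-1] == GITHUB_JOB
    ]
    assert len(matched_jobs) == 1 and matched_jobs[0]["id"] == 42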

View File

@@ -6,8 +6,11 @@ import re
 import shutil
 import time
 from multiprocessing.dummy import Pool
+from pathlib import Path
+from typing import List, Union

 import boto3  # type: ignore
+import botocore  # type: ignore

 from env_helper import (
     S3_TEST_REPORTS_BUCKET,
@@ -40,9 +43,12 @@ def _flatten_list(lst):

 class S3Helper:
+    max_pool_size = 100
+
     def __init__(self):
+        config = botocore.config.Config(max_pool_connections=self.max_pool_size)
         self.session = boto3.session.Session(region_name="us-east-1")
-        self.client = self.session.client("s3", endpoint_url=S3_URL)
+        self.client = self.session.client("s3", endpoint_url=S3_URL, config=config)
         self.host = S3_URL
         self.download_host = S3_DOWNLOAD
@@ -124,7 +130,9 @@
         else:
             return S3Helper.copy_file_to_local(S3_BUILDS_BUCKET, file_path, s3_path)

-    def fast_parallel_upload_dir(self, dir_path, s3_dir_path, bucket_name):
+    def fast_parallel_upload_dir(
+        self, dir_path: Union[str, Path], s3_dir_path: str, bucket_name: str
+    ) -> List[str]:
         all_files = []
         for root, _, files in os.walk(dir_path):
@@ -137,12 +145,12 @@
         t = time.time()
         sum_time = 0

-        def upload_task(file_path):
+        def upload_task(file_path: str) -> str:
             nonlocal counter
             nonlocal t
             nonlocal sum_time
             try:
-                s3_path = file_path.replace(dir_path, s3_dir_path)
+                s3_path = file_path.replace(str(dir_path), s3_dir_path)
                 metadata = {}
                 if s3_path.endswith("html"):
                     metadata["ContentType"] = "text/html; charset=utf-8"
@@ -167,25 +175,20 @@
                 if counter % 1000 == 0:
                     sum_time += int(time.time() - t)
                     print(
-                        "Uploaded",
-                        counter,
-                        "-",
-                        int(time.time() - t),
-                        "s",
-                        "sum time",
-                        sum_time,
-                        "s",
+                        f"Uploaded {counter}, {int(time.time()-t)}s, "
+                        f"sum time {sum_time}s",
                     )
                     t = time.time()
             except Exception as ex:
                 logging.critical("Failed to upload file, expcetion %s", ex)
             return f"{self.download_host}/{bucket_name}/{s3_path}"

-        p = Pool(256)
+        p = Pool(self.max_pool_size)
+        original_level = logging.root.level
         logging.basicConfig(level=logging.CRITICAL)
         result = sorted(_flatten_list(p.map(upload_task, all_files)))
-        logging.basicConfig(level=logging.INFO)
+        logging.basicConfig(level=original_level)
         return result

     def _upload_folder_to_s3(
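Two small fixes in this last hunk are easy to miss: the thread pool now reuses max_pool_size (100), presumably to stay aligned with the S3 client's max_pool_connections instead of the hard-coded 256, and the root log level is restored to whatever the caller had configured rather than being forced back to INFO. A standalone sketch of that save-and-restore pattern, assuming nothing beyond the standard library; setLevel is used here in place of basicConfig, since basicConfig is a no-op once handlers are already configured:

    import logging

    # Remember the caller's level, silence chatty per-file upload logging,
    # then restore instead of assuming the caller wanted INFO.
    original_level = logging.root.level
    logging.root.setLevel(logging.CRITICAL)
    try:
        pass  # ... the noisy parallel upload work would run here ...
    finally:
        logging.root.setLevel(original_level)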