#!/usr/bin/env python3
# -*- coding: utf-8 -*-
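"""Download test dataset archives from S3 and unpack them into the ClickHouse
data directory."""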

import os
import sys
import time
import tarfile
import logging
import argparse
import requests
import tempfile
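
# Default source bucket, the archive served for each dataset, and how many
# download attempts to make before giving up.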
DEFAULT_URL = "https://clickhouse-datasets.s3.amazonaws.com"

AVAILABLE_DATASETS = {
    "hits": "hits_v1.tar",
    "visits": "visits_v1.tar",
}

RETRIES_COUNT = 5
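

# Pick a unique path in the system temp directory for the downloaded archive.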
def _get_temp_file_name():
    # tempfile.mkstemp() creates the file and returns an open descriptor;
    # close it right away and reuse the reserved path for the download.
    fd, path = tempfile.mkstemp()
    os.close(fd)
    return path
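

# Build the full archive URL: <base>/<dataset>/partitions/<archive name>.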
def build_url(base_url, dataset):
    return os.path.join(base_url, dataset, "partitions", AVAILABLE_DATASETS[dataset])
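

# Download the archive to `path`, retrying up to RETRIES_COUNT times and
# drawing a progress bar when stdout is a terminal.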
def download_with_progress(url, path):
    logging.info("Downloading from %s to temp path %s", url, path)
    for i in range(RETRIES_COUNT):
        try:
            with open(path, "wb") as f:
                response = requests.get(url, stream=True)
                response.raise_for_status()
                total_length = response.headers.get("content-length")
                if total_length is None or int(total_length) == 0:
                    logging.info(
                        "No content-length, will download file without progress"
                    )
                    f.write(response.content)
                else:
                    dl = 0
                    total_length = int(total_length)
                    logging.info("Content length is %ld bytes", total_length)
                    for data in response.iter_content(chunk_size=4096):
                        dl += len(data)
                        f.write(data)
                        if sys.stdout.isatty():
                            done = int(50 * dl / total_length)
                            percent = int(100 * float(dl) / total_length)
                            sys.stdout.write(
                                "\r[{}{}] {}%".format(
                                    "=" * done, " " * (50 - done), percent
                                )
                            )
                            sys.stdout.flush()
            # Download succeeded, stop retrying.
            break
        except Exception as ex:
            sys.stdout.write("\n")
            time.sleep(3)
            logging.info("Exception while downloading %s, retry %s", ex, i + 1)
            if os.path.exists(path):
                os.remove(path)
    else:
        # The for/else branch runs only if every attempt failed without a break.
        raise Exception(
            "Cannot download dataset from {}, all retries exceeded".format(url)
        )

    sys.stdout.write("\n")
    logging.info("Downloading finished")
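

# Extract the downloaded tar archive into the ClickHouse data directory.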
def unpack_to_clickhouse_directory(tar_path, clickhouse_path):
    logging.info(
        "Will unpack data from temp path %s to clickhouse db %s",
        tar_path,
        clickhouse_path,
    )
    with tarfile.open(tar_path, "r") as comp_file:
        comp_file.extractall(path=clickhouse_path)
    logging.info("Unpack finished")
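

# Example invocation (the script name here is illustrative):
#   ./download_datasets.py --dataset-names hits visits \
#       --clickhouse-data-path /var/lib/clickhouse/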
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    parser = argparse.ArgumentParser(
        description="Simple tool for downloading datasets for ClickHouse from S3"
    )

    parser.add_argument(
        "--dataset-names",
        required=True,
        nargs="+",
        choices=list(AVAILABLE_DATASETS.keys()),
    )
    parser.add_argument("--url-prefix", default=DEFAULT_URL)
    parser.add_argument("--clickhouse-data-path", default="/var/lib/clickhouse/")

    args = parser.parse_args()
    datasets = args.dataset_names
    logging.info("Will fetch the following datasets: %s", ", ".join(datasets))
    for dataset in datasets:
        logging.info("Processing %s", dataset)
        temp_archive_path = _get_temp_file_name()
        try:
            download_url_for_dataset = build_url(args.url_prefix, dataset)
            download_with_progress(download_url_for_dataset, temp_archive_path)
            unpack_to_clickhouse_directory(temp_archive_path, args.clickhouse_data_path)
        except Exception as ex:
            logging.info("Some exception occurred: %s", str(ex))
            raise
        finally:
            logging.info(
                "Will remove downloaded file %s from filesystem if it exists",
                temp_archive_path,
            )
            if os.path.exists(temp_archive_path):
                os.remove(temp_archive_path)
        logging.info("Processing of %s finished", dataset)
    logging.info("Fetch finished, enjoy your tables!")