ClickHouse/utils/s3tools/s3uploader

223 lines
8.4 KiB
Plaintext
Raw Normal View History

2020-10-02 16:54:07 +00:00
#!/usr/bin/env python3
2019-01-29 12:12:28 +00:00
# -*- coding: utf-8 -*-
import os
import logging
import argparse
import tarfile
import math
try:
from boto.s3.connection import S3Connection
2021-09-08 17:22:24 +00:00
from boto.s3.key import Key
2019-01-29 12:12:28 +00:00
except ImportError:
raise ImportError("You have to install boto package 'pip install boto'")
class S3API(object):
    """Minimal helper around boto's S3 API for uploading dataset files.

    ``mds_api`` is the S3 endpoint hostname used for the connection;
    ``mds_url`` is the hostname used when building public download URLs.
    """

    def __init__(self, access_key, secret_access_key, mds_api, mds_url):
        self.connection = S3Connection(
            host=mds_api,
            aws_access_key_id=access_key,
            aws_secret_access_key=secret_access_key,
        )
        self.mds_url = mds_url

    def upload_file(self, bucket_name, file_path, s3_path):
        """Upload one (possibly multi-GB) file via S3 multipart upload.

        Returns the public https URL of the uploaded object.  On any
        failure the multipart upload is cancelled before re-raising.
        """
        logging.info("Start uploading file to bucket %s", bucket_name)

        bucket = self.connection.get_bucket(bucket_name)
        key = bucket.initiate_multipart_upload(s3_path)
        logging.info("Will upload to s3 path %s", s3_path)
        chunksize = 1024 * 1024 * 1024  # 1 GB
        filesize = os.stat(file_path).st_size
        logging.info("File size is %s", filesize)
        # FIX: under Python 3's true division, ceil() already yields the
        # exact number of parts; the historical "+ 1" uploaded a spurious
        # empty part whenever the size was a multiple of the chunk size.
        # Keep at least one part so empty files still complete the upload.
        chunkcount = max(1, int(math.ceil(filesize / chunksize)))

        def call_back(x, y):
            print("Uploaded {}/{} bytes".format(x, y))

        try:
            # FIX: open in binary mode — seek() to arbitrary byte offsets is
            # only valid on binary files, and the payload (tarballs, .xz)
            # is binary anyway.  Open once instead of once per chunk.
            with open(file_path, "rb") as fp:
                for i in range(chunkcount):
                    logging.info("Uploading chunk %s of %s", i, chunkcount)
                    offset = chunksize * i
                    bytes_size = min(chunksize, filesize - offset)
                    fp.seek(offset)
                    key.upload_part_from_file(
                        fp=fp, part_num=i + 1, size=bytes_size, cb=call_back, num_cb=100
                    )
            key.complete_upload()
        except Exception:
            # Abort the multipart upload so incomplete parts are discarded.
            key.cancel_upload()
            raise
        logging.info("Contents were set")
        return "https://{bucket}.{mds_url}/{path}".format(
            bucket=bucket_name, mds_url=self.mds_url, path=s3_path
        )

    def set_file_contents(self, bucket, local_file_path, s3_file_path):
        """Upload one small local file to ``s3_file_path`` inside ``bucket``."""
        key = Key(bucket)
        key.key = s3_file_path
        file_size = os.stat(local_file_path).st_size
        logging.info(
            "Uploading file `%s` to `%s`. Size is %s",
            local_file_path,
            s3_file_path,
            file_size,
        )

        def call_back(x, y):
            print("Uploaded {}/{} bytes".format(x, y))

        key.set_contents_from_filename(local_file_path, cb=call_back)

    def upload_data_for_static_files_disk(self, bucket_name, directory_path, s3_path):
        """Recursively upload ``directory_path`` under ``<s3_path>/store/``.

        Returns the public https URL of the uploaded prefix.
        """
        bucket = self.connection.get_bucket(bucket_name)
        if s3_path.endswith("/"):
            s3_path += "store/"
        else:
            s3_path += "/store/"
        print(s3_path)
        for root, _dirs, files in os.walk(directory_path):
            for file_name in files:
                local_file_path = os.path.join(root, file_name)
                # Key path relative to the root of the uploaded directory.
                s3_file = local_file_path[len(directory_path) + 1 :]
                s3_file_path = os.path.join(s3_path, s3_file)
                self.set_file_contents(bucket, local_file_path, s3_file_path)
        logging.info("Uploading finished")
        return "https://{bucket}.{mds_url}/{path}".format(
            bucket=bucket_name, mds_url=self.mds_url, path=s3_path
        )

    def list_bucket_keys(self, bucket_name):
        """Print every key stored in ``bucket_name``."""
        bucket = self.connection.get_bucket(bucket_name)
        for obj in bucket.get_all_keys():
            print(obj.key)

    def remove_folder_from_bucket(self, bucket_name, folder_path):
        """Delete every key in ``bucket_name`` whose name starts with ``folder_path``."""
        bucket = self.connection.get_bucket(bucket_name)
        # (dropped a redundant extra get_all_keys() round-trip that was here)
        for obj in bucket.get_all_keys():
            if obj.key.startswith(folder_path):
                print("Removing " + obj.key)
                obj.delete()
2019-01-29 12:12:28 +00:00
2023-03-23 15:33:23 +00:00
def make_tar_file_for_table(clickhouse_data_path, db_name, table_name, tmp_prefix):
    """Pack one table's on-disk data and metadata into a tar archive.

    Archives ``data/<db>/<table>`` and ``metadata/<db>/<table>.sql`` from
    ``clickhouse_data_path`` (stored under those same relative names) and
    returns the path of the archive created under ``tmp_prefix``.
    """
    data_rel = os.path.join("data", db_name, table_name)
    meta_rel = os.path.join("metadata", db_name, table_name + ".sql")

    archive_path = f"{tmp_prefix}/{table_name}.tar"
    with tarfile.open(archive_path, "w") as archive:
        archive.add(os.path.join(clickhouse_data_path, data_rel), arcname=data_rel)
        archive.add(os.path.join(clickhouse_data_path, meta_rel), arcname=meta_rel)
    return archive_path
2023-03-23 15:33:23 +00:00
USAGE_EXAMPLES = """
2019-01-29 12:12:28 +00:00
examples:
2020-09-18 14:56:42 +00:00
\t./s3uploader --dataset-name some_ds --access-key-id XXX --secret-access-key YYY --clickhouse-data-path /opt/clickhouse/ --table-name default.some_tbl --bucket-name some-bucket
\t./s3uploader --dataset-name some_ds --access-key-id XXX --secret-access-key YYY --file-path some_ds.tsv.xz --bucket-name some-bucket --s3-path /path/to/
2023-03-23 15:33:23 +00:00
"""
2019-01-29 12:12:28 +00:00
if __name__ == "__main__":
2023-03-23 15:33:23 +00:00
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
2019-01-29 12:12:28 +00:00
parser = argparse.ArgumentParser(
description="Simple tool for uploading datasets to clickhouse S3",
2023-03-23 15:33:23 +00:00
usage="%(prog)s [options] {}".format(USAGE_EXAMPLES),
)
parser.add_argument("--s3-api-url", default="s3.amazonaws.com")
parser.add_argument("--s3-common-url", default="s3.amazonaws.com")
parser.add_argument("--bucket-name", default="clickhouse-datasets")
parser.add_argument(
"--dataset-name",
required=True,
help="Name of dataset, will be used in uploaded path",
)
parser.add_argument("--access-key-id", required=True)
parser.add_argument("--secret-access-key", required=True)
parser.add_argument(
"--clickhouse-data-path",
default="/var/lib/clickhouse/",
help="Path to clickhouse database on filesystem",
)
parser.add_argument("--s3-path", help="Path in s3, where to upload file")
parser.add_argument(
"--tmp-prefix", default="/tmp", help="Prefix to store temporary downloaded file"
)
2019-01-29 12:12:28 +00:00
data_group = parser.add_mutually_exclusive_group(required=True)
2023-03-23 15:33:23 +00:00
table_name_argument = data_group.add_argument(
"--table-name",
help="Name of table with database, if you are uploading partitions",
)
data_group.add_argument("--file-path", help="Name of file, if you are uploading")
data_group.add_argument(
"--directory-path", help="Path to directory with files to upload"
)
data_group.add_argument(
"--list-directory", help="List s3 directory by --directory-path"
)
data_group.add_argument(
"--remove-directory", help="Remove s3 directory by --directory-path"
)
2019-01-29 12:12:28 +00:00
args = parser.parse_args()
if args.table_name is not None and args.clickhouse_data_path is None:
2023-03-23 15:33:23 +00:00
raise argparse.ArgumentError(
table_name_argument,
"You should specify --clickhouse-data-path to upload --table",
)
2019-01-29 12:12:28 +00:00
s3_conn = S3API(
2023-03-23 15:33:23 +00:00
args.access_key_id, args.secret_access_key, args.s3_api_url, args.s3_common_url
)
2019-01-29 12:12:28 +00:00
2023-03-23 15:33:23 +00:00
file_path = ""
2021-09-08 17:22:24 +00:00
directory_path = args.directory_path
2021-09-03 21:43:15 +00:00
s3_path = args.s3_path
2021-09-08 17:22:24 +00:00
if args.list_directory:
s3_conn.list_bucket_keys(args.bucket_name)
elif args.remove_directory:
2023-03-23 15:33:23 +00:00
print("Removing s3 path: " + args.remove_directory)
2021-09-08 17:22:24 +00:00
s3_conn.remove_folder_from_bucket(args.bucket_name, args.remove_directory)
2021-09-03 21:43:15 +00:00
elif args.directory_path is not None:
2023-03-23 15:33:23 +00:00
url = s3_conn.upload_data_for_static_files_disk(
args.bucket_name, directory_path, s3_path
)
2021-09-08 17:22:24 +00:00
logging.info("Data uploaded: %s", url)
2019-01-29 12:12:28 +00:00
else:
2021-09-03 21:43:15 +00:00
if args.table_name is not None:
2023-03-23 15:33:23 +00:00
if "." not in args.table_name:
db_name = "default"
2021-09-03 21:43:15 +00:00
else:
2023-03-23 15:33:23 +00:00
db_name, table_name = args.table_name.split(".")
2021-09-03 21:43:15 +00:00
file_path = make_tar_file_for_table(
2023-03-23 15:33:23 +00:00
args.clickhouse_data_path, db_name, table_name, args.tmp_prefix
)
2021-09-03 21:43:15 +00:00
else:
file_path = args.file_path
2023-03-23 15:33:23 +00:00
if "tsv" in file_path:
2021-09-03 21:43:15 +00:00
s3_path = os.path.join(
2023-03-23 15:33:23 +00:00
args.dataset_name, "tsv", os.path.basename(file_path)
)
2021-09-03 21:43:15 +00:00
if args.table_name is not None:
s3_path = os.path.join(
2023-03-23 15:33:23 +00:00
args.dataset_name, "partitions", os.path.basename(file_path)
)
2021-09-03 21:43:15 +00:00
elif args.s3_path is not None:
s3_path = os.path.join(
2023-03-23 15:33:23 +00:00
args.dataset_name, args.s3_path, os.path.basename(file_path)
)
2021-09-03 21:43:15 +00:00
else:
raise Exception("Don't know s3-path to upload")
url = s3_conn.upload_file(args.bucket_name, file_path, s3_path)
logging.info("Data uploaded: %s", url)