apply black formatter

Anton Popov 2023-03-23 15:33:23 +00:00
parent 21f5d20b9e
commit 0ee8dfad53
31 changed files with 1999 additions and 1059 deletions
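The commit is a mechanical re-format of the test scripts rather than a functional change. Below the metadata, every hunk touches only quoting, line wrapping, and blank lines. As a minimal sketch, assuming the black package is installed, the same kind of rewrite can be reproduced programmatically; black.format_str and black.Mode are part of black's public Python API, while the snippet being formatted is only a condensed, hypothetical echo of the first hunk, not the exact file contents:

import black

# Illustrative pre-commit style: single-quoted strings, the same pattern
# the hunks below rewrite. black parses but does not execute this code,
# so the undefined os reference is harmless here.
source = (
    "AVAILABLE_DATASETS = {'hits': 'hits_v1.tar', 'visits': 'visits_v1.tar'}\n"
    "def build_url(base_url, dataset):\n"
    "    return os.path.join(base_url, dataset, 'partitions', AVAILABLE_DATASETS[dataset])\n"
)

# black normalizes string quotes to double quotes, inserts the blank lines
# it requires between top-level definitions, and wraps any line that exceeds
# its default 88-character limit.
print(black.format_str(source, mode=black.Mode()))

In practice the same result comes from running the black command over the changed files and committing its output unchanged.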

View File

@@ -10,31 +10,38 @@ import requests
import tempfile
DEFAULT_URL = 'https://clickhouse-datasets.s3.amazonaws.com'
DEFAULT_URL = "https://clickhouse-datasets.s3.amazonaws.com"
AVAILABLE_DATASETS = {
'hits': 'hits_v1.tar',
'visits': 'visits_v1.tar',
"hits": "hits_v1.tar",
"visits": "visits_v1.tar",
}
RETRIES_COUNT = 5
def _get_temp_file_name():
return os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()))
return os.path.join(
tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())
)
def build_url(base_url, dataset):
return os.path.join(base_url, dataset, 'partitions', AVAILABLE_DATASETS[dataset])
return os.path.join(base_url, dataset, "partitions", AVAILABLE_DATASETS[dataset])
def dowload_with_progress(url, path):
logging.info("Downloading from %s to temp path %s", url, path)
for i in range(RETRIES_COUNT):
try:
with open(path, 'wb') as f:
with open(path, "wb") as f:
response = requests.get(url, stream=True)
response.raise_for_status()
total_length = response.headers.get('content-length')
total_length = response.headers.get("content-length")
if total_length is None or int(total_length) == 0:
logging.info("No content-length, will download file without progress")
logging.info(
"No content-length, will download file without progress"
)
f.write(response.content)
else:
dl = 0
@@ -46,7 +53,11 @@ def dowload_with_progress(url, path):
if sys.stdout.isatty():
done = int(50 * dl / total_length)
percent = int(100 * float(dl) / total_length)
sys.stdout.write("\r[{}{}] {}%".format('=' * done, ' ' * (50-done), percent))
sys.stdout.write(
"\r[{}{}] {}%".format(
"=" * done, " " * (50 - done), percent
)
)
sys.stdout.flush()
break
except Exception as ex:
@@ -56,14 +67,21 @@ def dowload_with_progress(url, path):
if os.path.exists(path):
os.remove(path)
else:
raise Exception("Cannot download dataset from {}, all retries exceeded".format(url))
raise Exception(
"Cannot download dataset from {}, all retries exceeded".format(url)
)
sys.stdout.write("\n")
logging.info("Downloading finished")
def unpack_to_clickhouse_directory(tar_path, clickhouse_path):
logging.info("Will unpack data from temp path %s to clickhouse db %s", tar_path, clickhouse_path)
with tarfile.open(tar_path, 'r') as comp_file:
logging.info(
"Will unpack data from temp path %s to clickhouse db %s",
tar_path,
clickhouse_path,
)
with tarfile.open(tar_path, "r") as comp_file:
comp_file.extractall(path=clickhouse_path)
logging.info("Unpack finished")
@@ -72,15 +90,21 @@ if __name__ == "__main__":
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(
description="Simple tool for dowloading datasets for clickhouse from S3")
description="Simple tool for dowloading datasets for clickhouse from S3"
)
parser.add_argument('--dataset-names', required=True, nargs='+', choices=list(AVAILABLE_DATASETS.keys()))
parser.add_argument('--url-prefix', default=DEFAULT_URL)
parser.add_argument('--clickhouse-data-path', default='/var/lib/clickhouse/')
parser.add_argument(
"--dataset-names",
required=True,
nargs="+",
choices=list(AVAILABLE_DATASETS.keys()),
)
parser.add_argument("--url-prefix", default=DEFAULT_URL)
parser.add_argument("--clickhouse-data-path", default="/var/lib/clickhouse/")
args = parser.parse_args()
datasets = args.dataset_names
logging.info("Will fetch following datasets: %s", ', '.join(datasets))
logging.info("Will fetch following datasets: %s", ", ".join(datasets))
for dataset in datasets:
logging.info("Processing %s", dataset)
temp_archive_path = _get_temp_file_name()
@@ -92,10 +116,11 @@ if __name__ == "__main__":
logging.info("Some exception occured %s", str(ex))
raise
finally:
logging.info("Will remove downloaded file %s from filesystem if it exists", temp_archive_path)
logging.info(
"Will remove downloaded file %s from filesystem if it exists",
temp_archive_path,
)
if os.path.exists(temp_archive_path):
os.remove(temp_archive_path)
logging.info("Processing of %s finished", dataset)
logging.info("Fetch finished, enjoy your tables!")

View File

@@ -77,7 +77,7 @@ def trim_for_log(s):
return s
lines = s.splitlines()
if len(lines) > 10000:
separator = "-" * 40 + str(len(lines) - 10000) + " lines are hidden" + "-" * 40
separator = "-" * 40 + str(len(lines) - 10000) + " lines are hidden" + "-" * 40
return "\n".join(lines[:5000] + [] + [separator] + [] + lines[-5000:])
else:
return "\n".join(lines)
@@ -95,7 +95,13 @@ class HTTPError(Exception):
# Helpers to execute queries via HTTP interface.
def clickhouse_execute_http(
base_args, query, timeout=30, settings=None, default_format=None, max_http_retries=5, retry_error_codes=False
base_args,
query,
timeout=30,
settings=None,
default_format=None,
max_http_retries=5,
retry_error_codes=False,
):
if args.secure:
client = http.client.HTTPSConnection(
@@ -146,12 +152,36 @@ def clickhouse_execute_http(
return data
def clickhouse_execute(base_args, query, timeout=30, settings=None, max_http_retries=5, retry_error_codes=False):
return clickhouse_execute_http(base_args, query, timeout, settings, max_http_retries=max_http_retries, retry_error_codes=retry_error_codes).strip()
def clickhouse_execute(
base_args,
query,
timeout=30,
settings=None,
max_http_retries=5,
retry_error_codes=False,
):
return clickhouse_execute_http(
base_args,
query,
timeout,
settings,
max_http_retries=max_http_retries,
retry_error_codes=retry_error_codes,
).strip()
def clickhouse_execute_json(base_args, query, timeout=60, settings=None, max_http_retries=5):
data = clickhouse_execute_http(base_args, query, timeout, settings, "JSONEachRow", max_http_retries=max_http_retries)
def clickhouse_execute_json(
base_args, query, timeout=60, settings=None, max_http_retries=5
):
data = clickhouse_execute_http(
base_args,
query,
timeout,
settings,
"JSONEachRow",
max_http_retries=max_http_retries,
)
if not data:
return None
rows = []
@@ -648,7 +678,9 @@ class TestCase:
clickhouse_execute(
args,
"CREATE DATABASE IF NOT EXISTS " + database + get_db_engine(testcase_args, database),
"CREATE DATABASE IF NOT EXISTS "
+ database
+ get_db_engine(testcase_args, database),
settings=get_create_database_settings(args, testcase_args),
)
@@ -831,7 +863,8 @@ class TestCase:
# TODO: remove checking "no-upgrade-check" after 23.1
elif args.upgrade_check and (
"no-upgrade-check" in tags or "no-upgrade-check" in tags):
"no-upgrade-check" in tags or "no-upgrade-check" in tags
):
return FailureReason.NO_UPGRADE_CHECK
elif tags and ("no-s3-storage" in tags) and args.s3_storage:
@@ -1051,7 +1084,11 @@ class TestCase:
@staticmethod
def send_test_name_failed(suite: str, case: str):
pid = os.getpid()
clickhouse_execute(args, f"SELECT 'Running test {suite}/{case} from pid={pid}'", retry_error_codes=True)
clickhouse_execute(
args,
f"SELECT 'Running test {suite}/{case} from pid={pid}'",
retry_error_codes=True,
)
def run_single_test(
self, server_logs_level, client_options
@@ -2220,6 +2257,7 @@ def find_binary(name):
raise Exception(f"{name} was not found in PATH")
def find_clickhouse_command(binary, command):
symlink = binary + "-" + command
if os.access(symlink, os.X_OK):
@@ -2228,6 +2266,7 @@ def find_clickhouse_command(binary, command):
# To avoid requiring symlinks (in case you download binary from CI)
return binary + " " + command
def get_additional_client_options(args):
if args.client_option:
return " ".join("--" + option for option in args.client_option)
@@ -2569,7 +2608,9 @@ if __name__ == "__main__":
"WARNING: --extract_from_config option is deprecated and will be removed the the future",
file=sys.stderr,
)
args.extract_from_config = find_clickhouse_command(args.binary, "extract-from-config")
args.extract_from_config = find_clickhouse_command(
args.binary, "extract-from-config"
)
if args.configclient:
args.client += " --config-file=" + args.configclient

View File

@@ -243,11 +243,18 @@ if __name__ == "__main__":
)
parser.add_argument(
"--no-random", action="store", dest="no_random", help="Disable tests order randomization"
"--no-random",
action="store",
dest="no_random",
help="Disable tests order randomization",
)
parser.add_argument(
"--pre-pull", action="store_true", default=False, dest="pre_pull", help="Pull images for docker_compose before all other actions"
"--pre-pull",
action="store_true",
default=False,
dest="pre_pull",
help="Pull images for docker_compose before all other actions",
)
parser.add_argument(
@@ -306,7 +313,6 @@ if __name__ == "__main__":
# if not args.no_random:
# rand_args += f"--random-seed={os.getpid()}"
net = ""
if args.network:
net = "--net={}".format(args.network)
@@ -416,8 +422,11 @@ if __name__ == "__main__":
name=CONTAINER_NAME,
)
cmd = cmd_base + " " + args.command
cmd_pre_pull = cmd_base + " find /compose -name docker_compose_*.yml -exec docker-compose -f '{}' pull \;"
cmd = cmd_base + " " + args.command
cmd_pre_pull = (
cmd_base
+ " find /compose -name docker_compose_*.yml -exec docker-compose -f '{}' pull \;"
)
containers = subprocess.check_output(
f"docker ps --all --quiet --filter name={CONTAINER_NAME} --format={{{{.ID}}}}",

View File

@@ -1,57 +1,72 @@
#!/usr/bin/env python3
def gen_queries():
create_template = 'create table tab_00386 (a Int8, b String, c Tuple(Int8), d Tuple(Tuple(Int8)), e Tuple(Int8, String), f Tuple(Tuple(Int8, String))) engine = MergeTree order by ({}) partition by {}'
drop_query = 'drop table if exists tab_00386'
values = ('1', "'a'", 'tuple(1)', 'tuple(tuple(1))', "(1, 'a')", "tuple((1, 'a'))")
create_template = "create table tab_00386 (a Int8, b String, c Tuple(Int8), d Tuple(Tuple(Int8)), e Tuple(Int8, String), f Tuple(Tuple(Int8, String))) engine = MergeTree order by ({}) partition by {}"
drop_query = "drop table if exists tab_00386"
values = ("1", "'a'", "tuple(1)", "tuple(tuple(1))", "(1, 'a')", "tuple((1, 'a'))")
insert_query = "insert into tab_00386 values (1, 'a', tuple(1), tuple(tuple(1)), (1, 'a'), tuple((1, 'a')))"
columns = tuple('a b c d'.split())
order_by_columns = tuple('a b c'.split())
partition_by_columns = tuple(' tuple() a'.split())
columns = tuple("a b c d".split())
order_by_columns = tuple("a b c".split())
partition_by_columns = tuple(" tuple() a".split())
for partition in partition_by_columns:
for key_mask in range(1, 1 << len(order_by_columns)):
key = ','.join(order_by_columns[i] for i in range(len(order_by_columns)) if (1 << i) & key_mask != 0)
key = ",".join(
order_by_columns[i]
for i in range(len(order_by_columns))
if (1 << i) & key_mask != 0
)
create_query = create_template.format(key, partition)
for q in (drop_query, create_query, insert_query):
yield q
for column, value in zip(columns, values):
yield 'select {} in {} from tab_00386'.format(column, value)
yield 'select {} in tuple({}) from tab_00386'.format(column, value)
yield 'select {} in (select {} from tab_00386) from tab_00386'.format(column, column)
yield "select {} in {} from tab_00386".format(column, value)
yield "select {} in tuple({}) from tab_00386".format(column, value)
yield "select {} in (select {} from tab_00386) from tab_00386".format(
column, column
)
for i in range(len(columns)):
for j in range(i, len(columns)):
yield 'select ({}, {}) in tuple({}, {}) from tab_00386'.format(columns[i], columns[j], values[i], values[j])
yield 'select ({}, {}) in (select {}, {} from tab_00386) from tab_00386'.format(columns[i], columns[j], columns[i], columns[j])
yield 'select ({}, {}) in (select ({}, {}) from tab_00386) from tab_00386'.format(columns[i], columns[j], columns[i], columns[j])
yield "select ({}, {}) in tuple({}, {}) from tab_00386".format(
columns[i], columns[j], values[i], values[j]
)
yield "select ({}, {}) in (select {}, {} from tab_00386) from tab_00386".format(
columns[i], columns[j], columns[i], columns[j]
)
yield "select ({}, {}) in (select ({}, {}) from tab_00386) from tab_00386".format(
columns[i], columns[j], columns[i], columns[j]
)
yield "select e in (1, 'a') from tab_00386"
yield "select f in tuple((1, 'a')) from tab_00386"
yield "select f in tuple(tuple((1, 'a'))) from tab_00386"
yield 'select e in (select a, b from tab_00386) from tab_00386'
yield 'select e in (select (a, b) from tab_00386) from tab_00386'
yield 'select f in (select tuple((a, b)) from tab_00386) from tab_00386'
yield 'select tuple(f) in (select tuple(tuple((a, b))) from tab_00386) from tab_00386'
yield "select e in (select a, b from tab_00386) from tab_00386"
yield "select e in (select (a, b) from tab_00386) from tab_00386"
yield "select f in (select tuple((a, b)) from tab_00386) from tab_00386"
yield "select tuple(f) in (select tuple(tuple((a, b))) from tab_00386) from tab_00386"
import requests
import os
def main():
url = os.environ['CLICKHOUSE_URL']
url = os.environ["CLICKHOUSE_URL"]
for q in gen_queries():
resp = requests.post(url, data=q)
if resp.status_code != 200 or resp.text.strip() not in ('1', ''):
print('Query:', q)
print('Code:', resp.status_code)
if resp.status_code != 200 or resp.text.strip() not in ("1", ""):
print("Query:", q)
print("Code:", resp.status_code)
print(resp.text)
break
requests.post(url, data='drop table tab_00386')
requests.post(url, data="drop table tab_00386")
if __name__ == "__main__":
main()

View File

@@ -2,8 +2,20 @@
import os, itertools, urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse, sys
def get_ch_answer(query):
return urllib.request.urlopen(os.environ.get('CLICKHOUSE_URL', 'http://localhost:' + os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') ), data=query.encode()).read().decode()
return (
urllib.request.urlopen(
os.environ.get(
"CLICKHOUSE_URL",
"http://localhost:" + os.environ.get("CLICKHOUSE_PORT_HTTP", "8123"),
),
data=query.encode(),
)
.read()
.decode()
)
def check_answers(query, answer):
ch_answer = get_ch_answer(query)
@@ -13,36 +25,34 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer)
exit(-1)
def get_values():
values = [0, 1, -1]
for bits in [8, 16, 32, 64]:
values += [2**bits, 2**bits - 1]
values += [2**(bits-1) - 1, 2**(bits-1), 2**(bits-1) + 1]
values += [-2**(bits-1) - 1, -2**(bits-1), -2**(bits-1) + 1]
values += [2 ** (bits - 1) - 1, 2 ** (bits - 1), 2 ** (bits - 1) + 1]
values += [-(2 ** (bits - 1)) - 1, -(2 ** (bits - 1)), -(2 ** (bits - 1)) + 1]
return values
def is_valid_integer(x):
return -2**63 <= x and x <= 2**64-1
return -(2**63) <= x and x <= 2**64 - 1
TEST_WITH_CASTING=True
GENERATE_TEST_FILES=False
TEST_WITH_CASTING = True
GENERATE_TEST_FILES = False
TYPES = {
"UInt8" : { "bits" : 8, "sign" : False, "float" : False },
"Int8" : { "bits" : 8, "sign" : True, "float" : False },
"UInt16": { "bits" : 16, "sign" : False, "float" : False },
"Int16" : { "bits" : 16, "sign" : True, "float" : False },
"UInt32": { "bits" : 32, "sign" : False, "float" : False },
"Int32" : { "bits" : 32, "sign" : True, "float" : False },
"UInt64": { "bits" : 64, "sign" : False, "float" : False },
"Int64" : { "bits" : 64, "sign" : True, "float" : False }
#"Float32" : { "bits" : 32, "sign" : True, "float" : True },
#"Float64" : { "bits" : 64, "sign" : True, "float" : True }
"UInt8": {"bits": 8, "sign": False, "float": False},
"Int8": {"bits": 8, "sign": True, "float": False},
"UInt16": {"bits": 16, "sign": False, "float": False},
"Int16": {"bits": 16, "sign": True, "float": False},
"UInt32": {"bits": 32, "sign": False, "float": False},
"Int32": {"bits": 32, "sign": True, "float": False},
"UInt64": {"bits": 64, "sign": False, "float": False},
"Int64": {"bits": 64, "sign": True, "float": False}
# "Float32" : { "bits" : 32, "sign" : True, "float" : True },
# "Float64" : { "bits" : 64, "sign" : True, "float" : True }
}
@@ -55,14 +65,18 @@ def inside_range(value, type_name):
return True
if signed:
return -2**(bits-1) <= value and value <= 2**(bits-1) - 1
return -(2 ** (bits - 1)) <= value and value <= 2 ** (bits - 1) - 1
else:
return 0 <= value and value <= 2**bits - 1
def test_operators(v1, v2, v1_passed, v2_passed):
query_str = "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2},\t".format(v1=v1_passed, v2=v2_passed)
query_str += "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2} ".format(v1=v2_passed, v2=v1_passed)
query_str = "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2},\t".format(
v1=v1_passed, v2=v2_passed
)
query_str += "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2} ".format(
v1=v2_passed, v2=v1_passed
)
answers = [v1 == v2, v1 != v2, v1 < v2, v1 <= v2, v1 > v2, v1 >= v2]
answers += [v2 == v1, v2 != v1, v2 < v1, v2 <= v1, v2 > v1, v2 >= v1]
@@ -74,6 +88,7 @@ def test_operators(v1, v2, v1_passed, v2_passed):
VALUES = [x for x in get_values() if is_valid_integer(x)]
def test_pair(v1, v2):
query = "SELECT {}, {}, ".format(v1, v2)
answers = "{}\t{}\t".format(v1, v2)
@@ -87,19 +102,58 @@ def test_pair(v1, v2):
if inside_range(v1, t1):
for t2 in TYPES.keys():
if inside_range(v2, t2):
q, a = test_operators(v1, v2, 'to{}({})'.format(t1, v1), 'to{}({})'.format(t2, v2))
query += ', ' + q
q, a = test_operators(
v1, v2, "to{}({})".format(t1, v1), "to{}({})".format(t2, v2)
)
query += ", " + q
answers += "\t" + a
check_answers(query, answers)
return query, answers
VALUES_INT = [0, -1, 1, 2**64-1, 2**63, -2**63, 2**63-1, 2**51, 2**52, 2**53-1, 2**53, 2**53+1, 2**53+2, -2**53+1, -2**53, -2**53-1, -2**53-2, 2*52, -2**52]
VALUES_FLOAT = [float(x) for x in VALUES_INT + [-0.5, 0.5, -1.5, 1.5, 2**53, 2**51 - 0.5, 2**51 + 0.5, 2**60, -2**60, -2**63 - 10000, 2**63 + 10000]]
VALUES_INT = [
0,
-1,
1,
2**64 - 1,
2**63,
-(2**63),
2**63 - 1,
2**51,
2**52,
2**53 - 1,
2**53,
2**53 + 1,
2**53 + 2,
-(2**53) + 1,
-(2**53),
-(2**53) - 1,
-(2**53) - 2,
2 * 52,
-(2**52),
]
VALUES_FLOAT = [
float(x)
for x in VALUES_INT
+ [
-0.5,
0.5,
-1.5,
1.5,
2**53,
2**51 - 0.5,
2**51 + 0.5,
2**60,
-(2**60),
-(2**63) - 10000,
2**63 + 10000,
]
]
def test_float_pair(i, f):
f_str = ("%.9f" % f)
f_str = "%.9f" % f
query = "SELECT '{}', '{}', ".format(i, f_str)
answers = "{}\t{}\t".format(i, f_str)
@@ -110,8 +164,8 @@ def test_float_pair(i, f):
if TEST_WITH_CASTING:
for t1 in TYPES.keys():
if inside_range(i, t1):
q, a = test_operators(i, f, 'to{}({})'.format(t1, i), f_str)
query += ', ' + q
q, a = test_operators(i, f, "to{}({})".format(t1, i), f_str)
query += ", " + q
answers += "\t" + a
check_answers(query, answers)
@@ -120,22 +174,26 @@ def test_float_pair(i, f):
def main():
if GENERATE_TEST_FILES:
base_name = '00411_accurate_number_comparison'
sql_file = open(base_name + '.sql', 'wt')
ref_file = open(base_name + '.reference', 'wt')
base_name = "00411_accurate_number_comparison"
sql_file = open(base_name + ".sql", "wt")
ref_file = open(base_name + ".reference", "wt")
num_int_tests = len(list(itertools.combinations(VALUES, 2)))
num_parts = 4
for part in range(0, num_parts):
if 'int' + str(part + 1) in sys.argv[1:]:
for (v1, v2) in itertools.islice(itertools.combinations(VALUES, 2), part * num_int_tests // num_parts, (part + 1) * num_int_tests // num_parts):
if "int" + str(part + 1) in sys.argv[1:]:
for (v1, v2) in itertools.islice(
itertools.combinations(VALUES, 2),
part * num_int_tests // num_parts,
(part + 1) * num_int_tests // num_parts,
):
q, a = test_pair(v1, v2)
if GENERATE_TEST_FILES:
sql_file.write(q + ";\n")
ref_file.write(a + "\n")
if 'float' in sys.argv[1:]:
if "float" in sys.argv[1:]:
for (i, f) in itertools.product(VALUES_INT, VALUES_FLOAT):
q, a = test_float_pair(i, f)
if GENERATE_TEST_FILES:

View File

@@ -12,6 +12,7 @@ import subprocess
from io import StringIO
from http.server import BaseHTTPRequestHandler, HTTPServer
def is_ipv6(host):
try:
socket.inet_aton(host)
@@ -19,6 +20,7 @@ def is_ipv6(host):
except:
return True
def get_local_port(host, ipv6):
if ipv6:
family = socket.AF_INET6
@@ -29,8 +31,9 @@ def get_local_port(host, ipv6):
fd.bind((host, 0))
return fd.getsockname()[1]
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "127.0.0.1")
CLICKHOUSE_PORT_HTTP = os.environ.get("CLICKHOUSE_PORT_HTTP", "8123")
#####################################################################################
# This test starts an HTTP server and serves data to clickhouse url-engine based table.
@@ -39,27 +42,42 @@ CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
#####################################################################################
# IP-address of this host accessible from the outside world. Get the first one
HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0]
HTTP_SERVER_HOST = (
subprocess.check_output(["hostname", "-i"]).decode("utf-8").strip().split()[0]
)
IS_IPV6 = is_ipv6(HTTP_SERVER_HOST)
HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6)
# IP address and port of the HTTP server started from this script.
HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT)
if IS_IPV6:
HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
HTTP_SERVER_URL_STR = (
"http://"
+ f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/"
)
else:
HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
HTTP_SERVER_URL_STR = (
"http://" + f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}" + "/"
)
CSV_DATA = os.path.join(
tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())
)
CSV_DATA = os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()))
def get_ch_answer(query):
host = CLICKHOUSE_HOST
if IS_IPV6:
host = f'[{host}]'
host = f"[{host}]"
url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP))
url = os.environ.get(
"CLICKHOUSE_URL",
"http://{host}:{port}".format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP),
)
return urllib.request.urlopen(url, data=query.encode()).read().decode()
def check_answers(query, answer):
ch_answer = get_ch_answer(query)
if ch_answer.strip() != answer.strip():
@@ -68,18 +86,19 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer, file=sys.stderr)
raise Exception("Fail on query")
class CSVHTTPServer(BaseHTTPRequestHandler):
def _set_headers(self):
self.send_response(200)
self.send_header('Content-type', 'text/csv')
self.send_header("Content-type", "text/csv")
self.end_headers()
def do_GET(self):
self._set_headers()
with open(CSV_DATA, 'r') as fl:
reader = csv.reader(fl, delimiter=',')
with open(CSV_DATA, "r") as fl:
reader = csv.reader(fl, delimiter=",")
for row in reader:
self.wfile.write((', '.join(row) + '\n').encode())
self.wfile.write((", ".join(row) + "\n").encode())
return
def do_HEAD(self):
@@ -87,33 +106,33 @@ class CSVHTTPServer(BaseHTTPRequestHandler):
return
def read_chunk(self):
msg = ''
msg = ""
while True:
sym = self.rfile.read(1)
if sym == '':
if sym == "":
break
msg += sym.decode('utf-8')
if msg.endswith('\r\n'):
msg += sym.decode("utf-8")
if msg.endswith("\r\n"):
break
length = int(msg[:-2], 16)
if length == 0:
return ''
return ""
content = self.rfile.read(length)
self.rfile.read(2) # read sep \r\n
return content.decode('utf-8')
self.rfile.read(2) # read sep \r\n
return content.decode("utf-8")
def do_POST(self):
data = ''
data = ""
while True:
chunk = self.read_chunk()
if not chunk:
break
data += chunk
with StringIO(data) as fl:
reader = csv.reader(fl, delimiter=',')
with open(CSV_DATA, 'a') as d:
reader = csv.reader(fl, delimiter=",")
with open(CSV_DATA, "a") as d:
for row in reader:
d.write(','.join(row) + '\n')
d.write(",".join(row) + "\n")
self._set_headers()
self.wfile.write(b"ok")
@@ -124,6 +143,7 @@ class CSVHTTPServer(BaseHTTPRequestHandler):
class HTTPServerV6(HTTPServer):
address_family = socket.AF_INET6
def start_server():
if IS_IPV6:
httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, CSVHTTPServer)
@@ -133,49 +153,76 @@ def start_server():
t = threading.Thread(target=httpd.serve_forever)
return t, httpd
# test section
def test_select(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests=[], answers=[], test_data=""):
with open(CSV_DATA, 'w') as f: # clear file
f.write('')
def test_select(
table_name="",
schema="str String,numuint UInt32,numint Int32,double Float64",
requests=[],
answers=[],
test_data="",
):
with open(CSV_DATA, "w") as f: # clear file
f.write("")
if test_data:
with open(CSV_DATA, 'w') as f:
with open(CSV_DATA, "w") as f:
f.write(test_data + "\n")
if table_name:
get_ch_answer("drop table if exists {}".format(table_name))
get_ch_answer("create table {} ({}) engine=URL('{}', 'CSV')".format(table_name, schema, HTTP_SERVER_URL_STR))
get_ch_answer(
"create table {} ({}) engine=URL('{}', 'CSV')".format(
table_name, schema, HTTP_SERVER_URL_STR
)
)
for i in range(len(requests)):
tbl = table_name
if not tbl:
tbl = "url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema)
tbl = "url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
check_answers(requests[i].format(tbl=tbl), answers[i])
if table_name:
get_ch_answer("drop table if exists {}".format(table_name))
def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests_insert=[], requests_select=[], answers=[]):
with open(CSV_DATA, 'w') as f: # flush test file
f.write('')
def test_insert(
table_name="",
schema="str String,numuint UInt32,numint Int32,double Float64",
requests_insert=[],
requests_select=[],
answers=[],
):
with open(CSV_DATA, "w") as f: # flush test file
f.write("")
if table_name:
get_ch_answer("drop table if exists {}".format(table_name))
get_ch_answer("create table {} ({}) engine=URL('{}', 'CSV')".format(table_name, schema, HTTP_SERVER_URL_STR))
get_ch_answer(
"create table {} ({}) engine=URL('{}', 'CSV')".format(
table_name, schema, HTTP_SERVER_URL_STR
)
)
for req in requests_insert:
tbl = table_name
if not tbl:
tbl = "table function url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema)
tbl = "table function url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
get_ch_answer(req.format(tbl=tbl))
for i in range(len(requests_select)):
tbl = table_name
if not tbl:
tbl = "url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema)
tbl = "url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
check_answers(requests_select[i].format(tbl=tbl), answers[i])
if table_name:
@@ -185,9 +232,11 @@ def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,do
def main():
test_data = "Hello,2,-2,7.7\nWorld,2,-5,8.8"
select_only_requests = {
"select str,numuint,numint,double from {tbl}" : test_data.replace(',', '\t'),
"select numuint, count(*) from {tbl} group by numuint" : "2\t2",
"select str,numuint,numint,double from {tbl} limit 1": test_data.split("\n")[0].replace(',', '\t'),
"select str,numuint,numint,double from {tbl}": test_data.replace(",", "\t"),
"select numuint, count(*) from {tbl} group by numuint": "2\t2",
"select str,numuint,numint,double from {tbl} limit 1": test_data.split("\n")[
0
].replace(",", "\t"),
}
insert_requests = [
@@ -196,21 +245,41 @@ def main():
]
select_requests = {
"select distinct numuint from {tbl} order by numuint": '\n'.join([str(i) for i in range(11)]),
"select count(*) from {tbl}": '12',
'select double, count(*) from {tbl} group by double order by double': "7.7\t2\n9.9\t10"
"select distinct numuint from {tbl} order by numuint": "\n".join(
[str(i) for i in range(11)]
),
"select count(*) from {tbl}": "12",
"select double, count(*) from {tbl} group by double order by double": "7.7\t2\n9.9\t10",
}
t, httpd = start_server()
t.start()
# test table with url engine
test_select(table_name="test_table_select", requests=list(select_only_requests.keys()), answers=list(select_only_requests.values()), test_data=test_data)
test_select(
table_name="test_table_select",
requests=list(select_only_requests.keys()),
answers=list(select_only_requests.values()),
test_data=test_data,
)
# test table function url
test_select(requests=list(select_only_requests.keys()), answers=list(select_only_requests.values()), test_data=test_data)
#test insert into table with url engine
test_insert(table_name="test_table_insert", requests_insert=insert_requests, requests_select=list(select_requests.keys()), answers=list(select_requests.values()))
#test insert into table function url
test_insert(requests_insert=insert_requests, requests_select=list(select_requests.keys()), answers=list(select_requests.values()))
test_select(
requests=list(select_only_requests.keys()),
answers=list(select_only_requests.values()),
test_data=test_data,
)
# test insert into table with url engine
test_insert(
table_name="test_table_insert",
requests_insert=insert_requests,
requests_select=list(select_requests.keys()),
answers=list(select_requests.values()),
)
# test insert into table function url
test_insert(
requests_insert=insert_requests,
requests_select=list(select_requests.keys()),
answers=list(select_requests.values()),
)
httpd.shutdown()
t.join()

View File

@@ -12,35 +12,46 @@ HAYSTACKS = [
NEEDLE = "needle"
HAY_RE = re.compile(r'\bhay\b', re.IGNORECASE)
NEEDLE_RE = re.compile(r'\bneedle\b', re.IGNORECASE)
HAY_RE = re.compile(r"\bhay\b", re.IGNORECASE)
NEEDLE_RE = re.compile(r"\bneedle\b", re.IGNORECASE)
def replace_follow_case(replacement):
def func(match):
g = match.group()
if g.islower(): return replacement.lower()
if g.istitle(): return replacement.title()
if g.isupper(): return replacement.upper()
if g.islower():
return replacement.lower()
if g.istitle():
return replacement.title()
if g.isupper():
return replacement.upper()
return replacement
return func
def replace_separators(query, new_sep):
SEP_RE = re.compile('\\s+')
SEP_RE = re.compile("\\s+")
result = SEP_RE.sub(new_sep, query)
return result
def enlarge_haystack(query, times, separator=''):
return HAY_RE.sub(replace_follow_case(('hay' + separator) * times), query)
def enlarge_haystack(query, times, separator=""):
return HAY_RE.sub(replace_follow_case(("hay" + separator) * times), query)
def small_needle(query):
return NEEDLE_RE.sub(replace_follow_case('n'), query)
return NEEDLE_RE.sub(replace_follow_case("n"), query)
def remove_needle(query):
return NEEDLE_RE.sub('', query)
return NEEDLE_RE.sub("", query)
def replace_needle(query, new_needle):
return NEEDLE_RE.sub(new_needle, query)
# with str.lower, str.uppert, str.title and such
def transform_needle(query, string_transformation_func):
def replace_with_transformation(match):
@@ -49,19 +60,21 @@ def transform_needle(query, string_transformation_func):
return NEEDLE_RE.sub(replace_with_transformation, query)
def create_cases(case_sensitive_func, case_insensitive_func, table_row_template, table_query_template, const_query_template):
def create_cases(
case_sensitive_func,
case_insensitive_func,
table_row_template,
table_query_template,
const_query_template,
):
const_queries = []
table_rows = []
table_queries = set()
def add_case(func, haystack, needle, match):
match = int(match)
args = dict(
func = func,
haystack = haystack,
needle = needle,
match = match
)
args = dict(func=func, haystack=haystack, needle=needle, match=match)
const_queries.append(const_query_template.substitute(args))
table_queries.add(table_query_template.substitute(args))
table_rows.append(table_row_template.substitute(args))
@@ -69,14 +82,28 @@ def create_cases(case_sensitive_func, case_insensitive_func, table_row_template,
def add_case_sensitive(haystack, needle, match):
add_case(case_sensitive_func, haystack, needle, match)
if match:
add_case(case_sensitive_func, transform_needle(haystack, str.swapcase), transform_needle(needle, str.swapcase), match)
add_case(
case_sensitive_func,
transform_needle(haystack, str.swapcase),
transform_needle(needle, str.swapcase),
match,
)
def add_case_insensitive(haystack, needle, match):
add_case(case_insensitive_func, haystack, needle, match)
if match:
add_case(case_insensitive_func, transform_needle(haystack, str.swapcase), needle, match)
add_case(case_insensitive_func, haystack, transform_needle(needle, str.swapcase), match)
add_case(
case_insensitive_func,
transform_needle(haystack, str.swapcase),
needle,
match,
)
add_case(
case_insensitive_func,
haystack,
transform_needle(needle, str.swapcase),
match,
)
# Negative cases
add_case_sensitive(remove_needle(HAYSTACKS[0]), NEEDLE, False)
@@ -85,7 +112,7 @@ def create_cases(case_sensitive_func, case_insensitive_func, table_row_template,
for haystack in HAYSTACKS:
add_case_sensitive(transform_needle(haystack, str.swapcase), NEEDLE, False)
sep = ''
sep = ""
h = replace_separators(haystack, sep)
add_case_sensitive(h, NEEDLE, False)
@@ -102,8 +129,7 @@ def create_cases(case_sensitive_func, case_insensitive_func, table_row_template,
add_case_sensitive(haystack, NEEDLE, True)
add_case_insensitive(haystack, NEEDLE, True)
for sep in list(''' ,'''):
for sep in list(""" ,"""):
h = replace_separators(haystack, sep)
add_case_sensitive(h, NEEDLE, True)
add_case_sensitive(small_needle(h), small_needle(NEEDLE), True)
@@ -114,32 +140,43 @@ def create_cases(case_sensitive_func, case_insensitive_func, table_row_template,
add_case_insensitive(enlarge_haystack(h, 200, sep), NEEDLE, True)
# case insesitivity works only on ASCII strings
add_case_sensitive(replace_needle(h, 'иголка'), replace_needle(NEEDLE, 'иголка'), True)
add_case_sensitive(replace_needle(h, '指针'), replace_needle(NEEDLE, '指针'), True)
add_case_sensitive(
replace_needle(h, "иголка"), replace_needle(NEEDLE, "иголка"), True
)
add_case_sensitive(
replace_needle(h, "指针"), replace_needle(NEEDLE, "指针"), True
)
for sep in list('''~!@$%^&*()-=+|]}[{";:/?.><\t''') + [r'\\\\']:
for sep in list("""~!@$%^&*()-=+|]}[{";:/?.><\t""") + [r"\\\\"]:
h = replace_separators(HAYSTACKS[0], sep)
add_case(case_sensitive_func, h, NEEDLE, True)
return table_rows, table_queries, const_queries
def main():
def main():
def query(x):
print(x)
CONST_QUERY = Template("""SELECT ${func}('${haystack}', '${needle}'), ' expecting ', ${match};""")
TABLE_QUERY = Template("""WITH '${needle}' as n
CONST_QUERY = Template(
"""SELECT ${func}('${haystack}', '${needle}'), ' expecting ', ${match};"""
)
TABLE_QUERY = Template(
"""WITH '${needle}' as n
SELECT haystack, needle, ${func}(haystack, n) as result
FROM ht
WHERE func = '${func}' AND needle = n AND result != match;""")
WHERE func = '${func}' AND needle = n AND result != match;"""
)
TABLE_ROW = Template("""('${haystack}', '${needle}', ${match}, '${func}')""")
rows, table_queries, const_queries = create_cases('hasToken', 'hasTokenCaseInsensitive', TABLE_ROW, TABLE_QUERY, CONST_QUERY)
rows, table_queries, const_queries = create_cases(
"hasToken", "hasTokenCaseInsensitive", TABLE_ROW, TABLE_QUERY, CONST_QUERY
)
for q in const_queries:
query(q)
query("""DROP TABLE IF EXISTS ht;
query(
"""DROP TABLE IF EXISTS ht;
CREATE TABLE IF NOT EXISTS
ht
(
@@ -150,11 +187,15 @@ def main():
)
ENGINE MergeTree()
ORDER BY haystack;
INSERT INTO ht VALUES {values};""".format(values=", ".join(rows)))
INSERT INTO ht VALUES {values};""".format(
values=", ".join(rows)
)
)
for q in sorted(table_queries):
query(q)
query("""DROP TABLE ht""")
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@@ -8,28 +8,32 @@ import sys
import signal
CLICKHOUSE_CLIENT = os.environ.get('CLICKHOUSE_CLIENT')
CLICKHOUSE_CURL = os.environ.get('CLICKHOUSE_CURL')
CLICKHOUSE_URL = os.environ.get('CLICKHOUSE_URL')
CLICKHOUSE_CLIENT = os.environ.get("CLICKHOUSE_CLIENT")
CLICKHOUSE_CURL = os.environ.get("CLICKHOUSE_CURL")
CLICKHOUSE_URL = os.environ.get("CLICKHOUSE_URL")
def send_query(query):
cmd = list(CLICKHOUSE_CLIENT.split())
cmd += ['--query', query]
cmd += ["--query", query]
# print(cmd)
return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout
return subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
).stdout
def send_query_in_process_group(query):
cmd = list(CLICKHOUSE_CLIENT.split())
cmd += ['--query', query]
cmd += ["--query", query]
# print(cmd)
return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=os.setsid)
return subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=os.setsid
)
def read_lines_and_push_to_queue(pipe, queue):
try:
for line in iter(pipe.readline, ''):
for line in iter(pipe.readline, ""):
line = line.strip()
print(line)
sys.stdout.flush()
@@ -41,41 +45,44 @@ def read_lines_and_push_to_queue(pipe, queue):
def test():
send_query('DROP TABLE IF EXISTS test.lv').read()
send_query('DROP TABLE IF EXISTS test.mt').read()
send_query('CREATE TABLE test.mt (a Int32) Engine=MergeTree order by tuple()').read()
send_query('CREATE LIVE VIEW test.lv AS SELECT sum(a) FROM test.mt').read()
send_query("DROP TABLE IF EXISTS test.lv").read()
send_query("DROP TABLE IF EXISTS test.mt").read()
send_query(
"CREATE TABLE test.mt (a Int32) Engine=MergeTree order by tuple()"
).read()
send_query("CREATE LIVE VIEW test.lv AS SELECT sum(a) FROM test.mt").read()
q = queue.Queue()
p = send_query_in_process_group('WATCH test.lv')
p = send_query_in_process_group("WATCH test.lv")
thread = threading.Thread(target=read_lines_and_push_to_queue, args=(p.stdout, q))
thread.start()
line = q.get()
print(line)
assert (line == '0\t1')
assert line == "0\t1"
send_query('INSERT INTO test.mt VALUES (1),(2),(3)').read()
send_query("INSERT INTO test.mt VALUES (1),(2),(3)").read()
line = q.get()
print(line)
assert (line == '6\t2')
assert line == "6\t2"
send_query('INSERT INTO test.mt VALUES (4),(5),(6)').read()
send_query("INSERT INTO test.mt VALUES (4),(5),(6)").read()
line = q.get()
print(line)
assert (line == '21\t3')
assert line == "21\t3"
# Send Ctrl+C to client.
os.killpg(os.getpgid(p.pid), signal.SIGINT)
# This insert shouldn't affect lv.
send_query('INSERT INTO test.mt VALUES (7),(8),(9)').read()
send_query("INSERT INTO test.mt VALUES (7),(8),(9)").read()
line = q.get()
print(line)
assert (line is None)
assert line is None
send_query('DROP TABLE if exists test.lv').read()
send_query('DROP TABLE if exists test.lv').read()
send_query("DROP TABLE if exists test.lv").read()
send_query("DROP TABLE if exists test.lv").read()
thread.join()
test()

View File

@@ -7,26 +7,30 @@ import os
import sys
CLICKHOUSE_CLIENT = os.environ.get('CLICKHOUSE_CLIENT')
CLICKHOUSE_CURL = os.environ.get('CLICKHOUSE_CURL')
CLICKHOUSE_URL = os.environ.get('CLICKHOUSE_URL')
CLICKHOUSE_CLIENT = os.environ.get("CLICKHOUSE_CLIENT")
CLICKHOUSE_CURL = os.environ.get("CLICKHOUSE_CURL")
CLICKHOUSE_URL = os.environ.get("CLICKHOUSE_URL")
def send_query(query):
cmd = list(CLICKHOUSE_CLIENT.split())
cmd += ['--query', query]
cmd += ["--query", query]
# print(cmd)
return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout
return subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
).stdout
def send_http_query(query):
cmd = list(CLICKHOUSE_CURL.split()) # list(['curl', '-sSN', '--max-time', '10'])
cmd += ['-sSN', CLICKHOUSE_URL, '-d', query]
return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout
cmd = list(CLICKHOUSE_CURL.split()) # list(['curl', '-sSN', '--max-time', '10'])
cmd += ["-sSN", CLICKHOUSE_URL, "-d", query]
return subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
).stdout
def read_lines_and_push_to_queue(pipe, queue):
for line in iter(pipe.readline, ''):
for line in iter(pipe.readline, ""):
line = line.strip()
print(line)
sys.stdout.flush()
@@ -36,28 +40,31 @@ def read_lines_and_push_to_queue(pipe, queue):
def test():
send_query('DROP TABLE IF EXISTS test.lv').read()
send_query('DROP TABLE IF EXISTS test.mt').read()
send_query('CREATE TABLE test.mt (a Int32) Engine=MergeTree order by tuple()').read()
send_query('CREATE LIVE VIEW test.lv AS SELECT sum(a) FROM test.mt').read()
send_query("DROP TABLE IF EXISTS test.lv").read()
send_query("DROP TABLE IF EXISTS test.mt").read()
send_query(
"CREATE TABLE test.mt (a Int32) Engine=MergeTree order by tuple()"
).read()
send_query("CREATE LIVE VIEW test.lv AS SELECT sum(a) FROM test.mt").read()
q = queue.Queue()
pipe = send_http_query('WATCH test.lv')
pipe = send_http_query("WATCH test.lv")
thread = threading.Thread(target=read_lines_and_push_to_queue, args=(pipe, q))
thread.start()
line = q.get()
print(line)
assert (line == '0\t1')
assert line == "0\t1"
send_query('INSERT INTO test.mt VALUES (1),(2),(3)').read()
send_query("INSERT INTO test.mt VALUES (1),(2),(3)").read()
line = q.get()
print(line)
assert (line == '6\t2')
assert line == "6\t2"
send_query('DROP TABLE if exists test.lv').read()
send_query('DROP TABLE if exists test.lv').read()
send_query("DROP TABLE if exists test.lv").read()
send_query("DROP TABLE if exists test.lv").read()
thread.join()
test()

View File

@@ -6,69 +6,85 @@ import pandas as pd
import numpy as np
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
def test_and_check(name, a, b, t_stat, p_value, precision=1e-2):
client = ClickHouseClient()
client.query("DROP TABLE IF EXISTS ttest;")
client.query("CREATE TABLE ttest (left Float64, right UInt8) ENGINE = Memory;");
client.query("INSERT INTO ttest VALUES {};".format(", ".join(['({},{})'.format(i, 0) for i in a])))
client.query("INSERT INTO ttest VALUES {};".format(", ".join(['({},{})'.format(j, 1) for j in b])))
client.query("CREATE TABLE ttest (left Float64, right UInt8) ENGINE = Memory;")
client.query(
"INSERT INTO ttest VALUES {};".format(
", ".join(["({},{})".format(i, 0) for i in a])
)
)
client.query(
"INSERT INTO ttest VALUES {};".format(
", ".join(["({},{})".format(j, 1) for j in b])
)
)
real = client.query_return_df(
"SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name) +
"roundBankers({}(left, right).2, 16) as p_value ".format(name) +
"FROM ttest FORMAT TabSeparatedWithNames;")
real_t_stat = real['t_stat'][0]
real_p_value = real['p_value'][0]
assert(abs(real_t_stat - np.float64(t_stat)) < precision), "clickhouse_t_stat {}, scipy_t_stat {}".format(real_t_stat, t_stat)
assert(abs(real_p_value - np.float64(p_value)) < precision), "clickhouse_p_value {}, scipy_p_value {}".format(real_p_value, p_value)
"SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name)
+ "roundBankers({}(left, right).2, 16) as p_value ".format(name)
+ "FROM ttest FORMAT TabSeparatedWithNames;"
)
real_t_stat = real["t_stat"][0]
real_p_value = real["p_value"][0]
assert (
abs(real_t_stat - np.float64(t_stat)) < precision
), "clickhouse_t_stat {}, scipy_t_stat {}".format(real_t_stat, t_stat)
assert (
abs(real_p_value - np.float64(p_value)) < precision
), "clickhouse_p_value {}, scipy_p_value {}".format(real_p_value, p_value)
client.query("DROP TABLE IF EXISTS ttest;")
def test_student():
rvs1 = np.round(stats.norm.rvs(loc=1, scale=5,size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=10, scale=5,size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = True)
rvs1 = np.round(stats.norm.rvs(loc=1, scale=5, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=10, scale=5, size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var=True)
test_and_check("studentTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=5,size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=5,size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = True)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=5, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=5, size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var=True)
test_and_check("studentTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=2, scale=10,size=512), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=20,size=1024), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = True)
rvs1 = np.round(stats.norm.rvs(loc=2, scale=10, size=512), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=20, size=1024), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var=True)
test_and_check("studentTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=10,size=1024), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=10,size=512), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = True)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=10, size=1024), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=10, size=512), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var=True)
test_and_check("studentTTest", rvs1, rvs2, s, p)
def test_welch():
rvs1 = np.round(stats.norm.rvs(loc=1, scale=15,size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=10, scale=5,size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False)
rvs1 = np.round(stats.norm.rvs(loc=1, scale=15, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=10, scale=5, size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var=False)
test_and_check("welchTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=7,size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=3,size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=7, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=3, size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var=False)
test_and_check("welchTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=10,size=1024), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=1,size=512), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=10, size=1024), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=1, size=512), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var=False)
test_and_check("welchTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=5, scale=10,size=512), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=10,size=1024), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False)
rvs1 = np.round(stats.norm.rvs(loc=5, scale=10, size=512), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=10, size=1024), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var=False)
test_and_check("welchTTest", rvs1, rvs2, s, p)
if __name__ == "__main__":
test_student()
test_welch()

View File

@@ -6,7 +6,7 @@ import pandas as pd
import numpy as np
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
@@ -14,40 +14,51 @@ from pure_http_client import ClickHouseClient
def test_and_check(name, a, b, t_stat, p_value):
client = ClickHouseClient()
client.query("DROP TABLE IF EXISTS mann_whitney;")
client.query("CREATE TABLE mann_whitney (left Float64, right UInt8) ENGINE = Memory;");
client.query("INSERT INTO mann_whitney VALUES {};".format(", ".join(['({},{}), ({},{})'.format(i, 0, j, 1) for i,j in zip(a, b)])))
client.query(
"CREATE TABLE mann_whitney (left Float64, right UInt8) ENGINE = Memory;"
)
client.query(
"INSERT INTO mann_whitney VALUES {};".format(
", ".join(["({},{}), ({},{})".format(i, 0, j, 1) for i, j in zip(a, b)])
)
)
real = client.query_return_df(
"SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name) +
"roundBankers({}(left, right).2, 16) as p_value ".format(name) +
"FROM mann_whitney FORMAT TabSeparatedWithNames;")
real_t_stat = real['t_stat'][0]
real_p_value = real['p_value'][0]
assert(abs(real_t_stat - np.float64(t_stat) < 1e-2)), "clickhouse_t_stat {}, scipy_t_stat {}".format(real_t_stat, t_stat)
assert(abs(real_p_value - np.float64(p_value)) < 1e-2), "clickhouse_p_value {}, scipy_p_value {}".format(real_p_value, p_value)
"SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name)
+ "roundBankers({}(left, right).2, 16) as p_value ".format(name)
+ "FROM mann_whitney FORMAT TabSeparatedWithNames;"
)
real_t_stat = real["t_stat"][0]
real_p_value = real["p_value"][0]
assert abs(
real_t_stat - np.float64(t_stat) < 1e-2
), "clickhouse_t_stat {}, scipy_t_stat {}".format(real_t_stat, t_stat)
assert (
abs(real_p_value - np.float64(p_value)) < 1e-2
), "clickhouse_p_value {}, scipy_p_value {}".format(real_p_value, p_value)
client.query("DROP TABLE IF EXISTS mann_whitney;")
def test_mann_whitney():
rvs1 = np.round(stats.norm.rvs(loc=1, scale=5,size=500), 5)
rvs2 = np.round(stats.expon.rvs(scale=0.2,size=500), 5)
s, p = stats.mannwhitneyu(rvs1, rvs2, alternative='two-sided')
rvs1 = np.round(stats.norm.rvs(loc=1, scale=5, size=500), 5)
rvs2 = np.round(stats.expon.rvs(scale=0.2, size=500), 5)
s, p = stats.mannwhitneyu(rvs1, rvs2, alternative="two-sided")
test_and_check("mannWhitneyUTest", rvs1, rvs2, s, p)
test_and_check("mannWhitneyUTest('two-sided')", rvs1, rvs2, s, p)
equal = np.round(stats.cauchy.rvs(scale=5, size=500), 5)
s, p = stats.mannwhitneyu(equal, equal, alternative='two-sided')
s, p = stats.mannwhitneyu(equal, equal, alternative="two-sided")
test_and_check("mannWhitneyUTest('two-sided')", equal, equal, s, p)
s, p = stats.mannwhitneyu(equal, equal, alternative='less', use_continuity=False)
s, p = stats.mannwhitneyu(equal, equal, alternative="less", use_continuity=False)
test_and_check("mannWhitneyUTest('less', 0)", equal, equal, s, p)
rvs1 = np.round(stats.cauchy.rvs(scale=10,size=65536), 5)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=10,size=65536), 5)
s, p = stats.mannwhitneyu(rvs1, rvs2, alternative='greater')
rvs1 = np.round(stats.cauchy.rvs(scale=10, size=65536), 5)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=10, size=65536), 5)
s, p = stats.mannwhitneyu(rvs1, rvs2, alternative="greater")
test_and_check("mannWhitneyUTest('greater')", rvs1, rvs2, s, p)
if __name__ == "__main__":
test_mann_whitney()
print("Ok.")

View File

@@ -4,14 +4,18 @@ from random import randint, choices
import sys
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
client = ClickHouseClient()
N = 10
create_query = "CREATE TABLE t_cnf_fuzz(" + ", ".join([f"c{i} UInt8" for i in range(N)]) + ") ENGINE = Memory"
create_query = (
"CREATE TABLE t_cnf_fuzz("
+ ", ".join([f"c{i} UInt8" for i in range(N)])
+ ") ENGINE = Memory"
)
client.query("DROP TABLE IF EXISTS t_cnf_fuzz")
client.query(create_query)
@@ -35,6 +39,7 @@ client.query(insert_query)
MAX_CLAUSES = 10
MAX_ATOMS = 5
def generate_dnf():
clauses = []
num_clauses = randint(1, MAX_CLAUSES)
@@ -42,12 +47,17 @@ def generate_dnf():
num_atoms = randint(1, MAX_ATOMS)
atom_ids = choices(range(N), k=num_atoms)
negates = choices([0, 1], k=num_atoms)
atoms = [f"(NOT c{i})" if neg else f"c{i}" for (i, neg) in zip(atom_ids, negates)]
atoms = [
f"(NOT c{i})" if neg else f"c{i}" for (i, neg) in zip(atom_ids, negates)
]
clauses.append("(" + " AND ".join(atoms) + ")")
return " OR ".join(clauses)
select_query = "SELECT count() FROM t_cnf_fuzz WHERE {} SETTINGS convert_query_to_cnf = {}"
select_query = (
"SELECT count() FROM t_cnf_fuzz WHERE {} SETTINGS convert_query_to_cnf = {}"
)
fail_report = """
Failed query: '{}'.

View File

@@ -5,15 +5,20 @@ import random
import string
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
def get_random_string(length):
return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length))
return "".join(
random.choice(string.ascii_uppercase + string.digits) for _ in range(length)
)
client = ClickHouseClient()
def insert_block(table_name, block_granularity_rows, block_rows):
global client
block_data = []
@@ -25,9 +30,12 @@ def insert_block(table_name, block_granularity_rows, block_rows):
values_row = ", ".join("(1, '" + row + "')" for row in block_data)
client.query("INSERT INTO {} VALUES {}".format(table_name, values_row))
try:
client.query("DROP TABLE IF EXISTS t")
client.query("CREATE TABLE t (v UInt8, data String) ENGINE = MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0")
client.query(
"CREATE TABLE t (v UInt8, data String) ENGINE = MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0"
)
client.query("SYSTEM STOP MERGES t")
@@ -53,6 +61,10 @@ try:
client.query("SYSTEM START MERGES t")
client.query("OPTIMIZE TABLE t FINAL")
print(client.query_return_df("SELECT COUNT() as C FROM t FORMAT TabSeparatedWithNames")['C'][0])
print(
client.query_return_df(
"SELECT COUNT() as C FROM t FORMAT TabSeparatedWithNames"
)["C"][0]
)
finally:
client.query("DROP TABLE IF EXISTS t")

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
from http.server import SimpleHTTPRequestHandler,HTTPServer
from http.server import SimpleHTTPRequestHandler, HTTPServer
import socket
import csv
import sys
@@ -21,6 +21,7 @@ def is_ipv6(host):
except:
return True
def get_local_port(host, ipv6):
if ipv6:
family = socket.AF_INET6
@@ -31,8 +32,9 @@ def get_local_port(host, ipv6):
fd.bind((host, 0))
return fd.getsockname()[1]
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', 'localhost')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "localhost")
CLICKHOUSE_PORT_HTTP = os.environ.get("CLICKHOUSE_PORT_HTTP", "8123")
#####################################################################################
# This test starts an HTTP server and serves data to clickhouse url-engine based table.
@@ -42,16 +44,24 @@ CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
#####################################################################################
# IP-address of this host accessible from the outside world. Get the first one
HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0]
HTTP_SERVER_HOST = (
subprocess.check_output(["hostname", "-i"]).decode("utf-8").strip().split()[0]
)
IS_IPV6 = is_ipv6(HTTP_SERVER_HOST)
HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6)
# IP address and port of the HTTP server started from this script.
HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT)
if IS_IPV6:
HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
HTTP_SERVER_URL_STR = (
"http://"
+ f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/"
)
else:
HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
HTTP_SERVER_URL_STR = (
"http://" + f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}" + "/"
)
# Because we need to check the content of file.csv we can create this content and avoid reading csv
CSV_DATA = "Hello, 1\nWorld, 2\nThis, 152\nis, 9283\ntesting, 2313213\ndata, 555\n"
@@ -59,19 +69,24 @@ CSV_DATA = "Hello, 1\nWorld, 2\nThis, 152\nis, 9283\ntesting, 2313213\ndata, 555
# Choose compression method
# (Will change during test, need to check standard data sending, to make sure that nothing broke)
COMPRESS_METHOD = 'none'
ADDING_ENDING = ''
ENDINGS = ['.gz', '.xz']
COMPRESS_METHOD = "none"
ADDING_ENDING = ""
ENDINGS = [".gz", ".xz"]
SEND_ENCODING = True
def get_ch_answer(query):
host = CLICKHOUSE_HOST
if IS_IPV6:
host = f'[{host}]'
host = f"[{host}]"
url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP))
url = os.environ.get(
"CLICKHOUSE_URL",
"http://{host}:{port}".format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP),
)
return urllib.request.urlopen(url, data=query.encode()).read().decode()
def check_answers(query, answer):
ch_answer = get_ch_answer(query)
if ch_answer.strip() != answer.strip():
@@ -80,18 +95,19 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer, file=sys.stderr)
raise Exception("Fail on query")
# Server with head method which is useful for debuging by hands
class HttpProcessor(SimpleHTTPRequestHandler):
def _set_headers(self):
self.send_response(200)
if SEND_ENCODING:
self.send_header('Content-Encoding', COMPRESS_METHOD)
if COMPRESS_METHOD == 'none':
self.send_header('Content-Length', len(CSV_DATA.encode()))
self.send_header("Content-Encoding", COMPRESS_METHOD)
if COMPRESS_METHOD == "none":
self.send_header("Content-Length", len(CSV_DATA.encode()))
else:
self.compress_data()
self.send_header('Content-Length', len(self.data))
self.send_header('Content-Type', 'text/csv')
self.send_header("Content-Length", len(self.data))
self.send_header("Content-Type", "text/csv")
self.end_headers()
def do_HEAD(self):
@@ -99,18 +115,17 @@ class HttpProcessor(SimpleHTTPRequestHandler):
return
def compress_data(self):
if COMPRESS_METHOD == 'gzip':
if COMPRESS_METHOD == "gzip":
self.data = gzip.compress((CSV_DATA).encode())
elif COMPRESS_METHOD == 'lzma':
elif COMPRESS_METHOD == "lzma":
self.data = lzma.compress((CSV_DATA).encode())
else:
self.data = 'WRONG CONVERSATION'.encode()
self.data = "WRONG CONVERSATION".encode()
def do_GET(self):
self._set_headers()
if COMPRESS_METHOD == 'none':
if COMPRESS_METHOD == "none":
self.wfile.write(CSV_DATA.encode())
else:
self.wfile.write(self.data)
@ -119,9 +134,11 @@ class HttpProcessor(SimpleHTTPRequestHandler):
def log_message(self, format, *args):
return
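# Not part of the test above -- an illustrative sanity sketch of the two compression
# paths HttpProcessor exercises: the gzip and lzma payloads it serves round-trip
# cleanly with the standard-library codecs, whether the client learns about them via
# the Content-Encoding header or via a .gz/.xz URL suffix.
import gzip
import lzma

_sample = "Hello, 1\nWorld, 2\n"
assert gzip.decompress(gzip.compress(_sample.encode())).decode() == _sample
assert lzma.decompress(lzma.compress(_sample.encode())).decode() == _sample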
class HTTPServerV6(HTTPServer):
address_family = socket.AF_INET6
def start_server(requests_amount):
if IS_IPV6:
httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, HttpProcessor)
@ -135,52 +152,60 @@ def start_server(requests_amount):
t = threading.Thread(target=real_func)
return t
#####################################################################
# Testing area.
#####################################################################
def test_select(dict_name="", schema="word String, counter UInt32", requests=[], answers=[], test_data=""):
def test_select(
dict_name="",
schema="word String, counter UInt32",
requests=[],
answers=[],
test_data="",
):
global ADDING_ENDING
global SEND_ENCODING
global COMPRESS_METHOD
for i in range(len(requests)):
if i > 2:
ADDING_ENDING = ENDINGS[i-3]
ADDING_ENDING = ENDINGS[i - 3]
SEND_ENCODING = False
if dict_name:
get_ch_answer("drop dictionary if exists {}".format(dict_name))
get_ch_answer('''CREATE DICTIONARY {} ({})
get_ch_answer(
"""CREATE DICTIONARY {} ({})
PRIMARY KEY word
SOURCE(HTTP(url '{}' format 'CSV'))
LAYOUT(complex_key_hashed())
LIFETIME(0)'''.format(dict_name, schema, HTTP_SERVER_URL_STR + '/test.csv' + ADDING_ENDING))
LIFETIME(0)""".format(
dict_name, schema, HTTP_SERVER_URL_STR + "/test.csv" + ADDING_ENDING
)
)
COMPRESS_METHOD = requests[i]
print(i, COMPRESS_METHOD, ADDING_ENDING, SEND_ENCODING)
check_answers("SELECT * FROM {} ORDER BY word".format(dict_name), answers[i])
def main():
# first three for encoding, last two for url
insert_requests = [
'none',
'gzip',
'lzma',
'gzip',
'lzma'
]
insert_requests = ["none", "gzip", "lzma", "gzip", "lzma"]
# These answers were obtained experimentally in non-compressed mode and they are correct
answers = ['''Hello 1\nThis 152\nWorld 2\ndata 555\nis 9283\ntesting 2313213'''] * 5
answers = ["""Hello 1\nThis 152\nWorld 2\ndata 555\nis 9283\ntesting 2313213"""] * 5
t = start_server(len(insert_requests))
t.start()
test_select(dict_name="test_table_select", requests=insert_requests, answers=answers)
test_select(
dict_name="test_table_select", requests=insert_requests, answers=answers
)
t.join()
print("PASSED")
if __name__ == "__main__":
try:
main()
@ -191,5 +216,3 @@ if __name__ == "__main__":
sys.stderr.flush()
os._exit(1)

View File

@ -5,9 +5,10 @@ import socket
import os
import uuid
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
CLICKHOUSE_PORT = int(os.environ.get('CLICKHOUSE_PORT_TCP', '900000'))
CLICKHOUSE_DATABASE = os.environ.get('CLICKHOUSE_DATABASE', 'default')
CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "127.0.0.1")
CLICKHOUSE_PORT = int(os.environ.get("CLICKHOUSE_PORT_TCP", "900000"))
CLICKHOUSE_DATABASE = os.environ.get("CLICKHOUSE_DATABASE", "default")
def writeVarUInt(x, ba):
for _ in range(0, 9):
@ -24,12 +25,12 @@ def writeVarUInt(x, ba):
def writeStringBinary(s, ba):
b = bytes(s, 'utf-8')
b = bytes(s, "utf-8")
writeVarUInt(len(s), ba)
ba.extend(b)
def readStrict(s, size = 1):
def readStrict(s, size=1):
res = bytearray()
while size:
cur = s.recv(size)
@ -48,18 +49,23 @@ def readUInt(s, size=1):
val += res[i] << (i * 8)
return val
def readUInt8(s):
return readUInt(s)
def readUInt16(s):
return readUInt(s, 2)
def readUInt32(s):
return readUInt(s, 4)
def readUInt64(s):
return readUInt(s, 8)
def readVarUInt(s):
x = 0
for i in range(9):
@ -75,25 +81,25 @@ def readVarUInt(s):
def readStringBinary(s):
size = readVarUInt(s)
s = readStrict(s, size)
return s.decode('utf-8')
return s.decode("utf-8")
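# Illustrative sketch (independent of the socket helpers above): writeVarUInt and
# readVarUInt implement ClickHouse's variable-length unsigned integer -- 7 payload
# bits per byte with the high bit as a continuation flag (unsigned LEB128).
import io

def encode_varuint(x):
    out = bytearray()
    while True:
        byte = x & 0x7F
        x >>= 7
        out.append(byte | (0x80 if x else 0))
        if not x:
            return bytes(out)

def decode_varuint(buf):
    x = 0
    for i in range(9):
        byte = buf.read(1)[0]
        x |= (byte & 0x7F) << (7 * i)
        if not (byte & 0x80):
            break
    return x

assert decode_varuint(io.BytesIO(encode_varuint(300))) == 300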
def sendHello(s):
ba = bytearray()
writeVarUInt(0, ba) # Hello
writeStringBinary('simple native protocol', ba)
writeVarUInt(0, ba) # Hello
writeStringBinary("simple native protocol", ba)
writeVarUInt(21, ba)
writeVarUInt(9, ba)
writeVarUInt(54449, ba)
writeStringBinary('default', ba) # database
writeStringBinary('default', ba) # user
writeStringBinary('', ba) # pwd
writeStringBinary("default", ba) # database
writeStringBinary("default", ba) # user
writeStringBinary("", ba) # pwd
s.sendall(ba)
def receiveHello(s):
p_type = readVarUInt(s)
assert (p_type == 0) # Hello
assert p_type == 0 # Hello
server_name = readStringBinary(s)
# print("Server name: ", server_name)
server_version_major = readVarUInt(s)
@ -111,78 +117,79 @@ def receiveHello(s):
def serializeClientInfo(ba, query_id):
writeStringBinary('default', ba) # initial_user
writeStringBinary(query_id, ba) # initial_query_id
writeStringBinary('127.0.0.1:9000', ba) # initial_address
ba.extend([0] * 8) # initial_query_start_time_microseconds
ba.append(1) # TCP
writeStringBinary('os_user', ba) # os_user
writeStringBinary('client_hostname', ba) # client_hostname
writeStringBinary('client_name', ba) # client_name
writeStringBinary("default", ba) # initial_user
writeStringBinary(query_id, ba) # initial_query_id
writeStringBinary("127.0.0.1:9000", ba) # initial_address
ba.extend([0] * 8) # initial_query_start_time_microseconds
ba.append(1) # TCP
writeStringBinary("os_user", ba) # os_user
writeStringBinary("client_hostname", ba) # client_hostname
writeStringBinary("client_name", ba) # client_name
writeVarUInt(21, ba)
writeVarUInt(9, ba)
writeVarUInt(54449, ba)
writeStringBinary('', ba) # quota_key
writeVarUInt(0, ba) # distributed_depth
writeVarUInt(1, ba) # client_version_patch
ba.append(0) # No telemetry
writeStringBinary("", ba) # quota_key
writeVarUInt(0, ba) # distributed_depth
writeVarUInt(1, ba) # client_version_patch
ba.append(0) # No telemetry
def sendQuery(s, query):
ba = bytearray()
query_id = uuid.uuid4().hex
writeVarUInt(1, ba) # query
writeVarUInt(1, ba) # query
writeStringBinary(query_id, ba)
ba.append(1) # INITIAL_QUERY
ba.append(1) # INITIAL_QUERY
# client info
serializeClientInfo(ba, query_id)
writeStringBinary('', ba) # No settings
writeStringBinary('', ba) # No interserver secret
writeVarUInt(2, ba) # Stage - Complete
ba.append(0) # No compression
writeStringBinary(query, ba) # query, finally
writeStringBinary("", ba) # No settings
writeStringBinary("", ba) # No interserver secret
writeVarUInt(2, ba) # Stage - Complete
ba.append(0) # No compression
writeStringBinary(query, ba) # query, finally
s.sendall(ba)
def serializeBlockInfo(ba):
writeVarUInt(1, ba) # 1
ba.append(0) # is_overflows
writeVarUInt(2, ba) # 2
writeVarUInt(0, ba) # 0
ba.extend([0] * 4) # bucket_num
writeVarUInt(1, ba) # 1
ba.append(0) # is_overflows
writeVarUInt(2, ba) # 2
writeVarUInt(0, ba) # 0
ba.extend([0] * 4) # bucket_num
def sendEmptyBlock(s):
ba = bytearray()
writeVarUInt(2, ba) # Data
writeStringBinary('', ba)
writeVarUInt(2, ba) # Data
writeStringBinary("", ba)
serializeBlockInfo(ba)
writeVarUInt(0, ba) # rows
writeVarUInt(0, ba) # columns
writeVarUInt(0, ba) # rows
writeVarUInt(0, ba) # columns
s.sendall(ba)
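# Hedged sketch of the packet sequence the insert tests below follow, reusing the
# helpers defined in this file. The socket `s` is assumed to be already connected,
# and `block_bytes` is a fully serialized Data packet such as the LowCardinality
# block built by hand in insertValidLowCardinalityRow.
def insert_block_over_native_protocol(s, query, block_bytes):
    sendHello(s)
    receiveHello(s)
    sendQuery(s, query)
    sendEmptyBlock(s)  # empty block terminates the (empty) external-tables section
    s.sendall(block_bytes)  # the actual data block
    sendEmptyBlock(s)  # empty block signals end of data
    assertPacket(readVarUInt(s), 5)  # server answers with EndOfStream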
def assertPacket(packet, expected):
assert(packet == expected), packet
assert packet == expected, packet
def readHeader(s):
packet_type = readVarUInt(s)
if packet_type == 2: # Exception
if packet_type == 2: # Exception
raise RuntimeError(readException(s))
assertPacket(packet_type, 1) # Data
assertPacket(packet_type, 1) # Data
readStringBinary(s) # external table name
readStringBinary(s) # external table name
# BlockInfo
assertPacket(readVarUInt(s), 1) # 1
assertPacket(readUInt8(s), 0) # is_overflows
assertPacket(readVarUInt(s), 2) # 2
assertPacket(readUInt32(s), 4294967295) # bucket_num
assertPacket(readVarUInt(s), 0) # 0
columns = readVarUInt(s) # columns
rows = readVarUInt(s) # rows
assertPacket(readVarUInt(s), 1) # 1
assertPacket(readUInt8(s), 0) # is_overflows
assertPacket(readVarUInt(s), 2) # 2
assertPacket(readUInt32(s), 4294967295) # bucket_num
assertPacket(readVarUInt(s), 0) # 0
columns = readVarUInt(s)  # columns
rows = readVarUInt(s)  # rows
print("Rows {} Columns {}".format(rows, columns))
for _ in range(columns):
col_name = readStringBinary(s)
@ -194,9 +201,9 @@ def readException(s):
code = readUInt32(s)
name = readStringBinary(s)
text = readStringBinary(s)
readStringBinary(s) # trace
assertPacket(readUInt8(s), 0) # has_nested
return "code {}: {}".format(code, text.replace('DB::Exception:', ''))
readStringBinary(s) # trace
assertPacket(readUInt8(s), 0) # has_nested
return "code {}: {}".format(code, text.replace("DB::Exception:", ""))
def insertValidLowCardinalityRow():
@ -205,7 +212,12 @@ def insertValidLowCardinalityRow():
s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT))
sendHello(s)
receiveHello(s)
sendQuery(s, 'insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV'.format(CLICKHOUSE_DATABASE))
sendQuery(
s,
"insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV".format(
CLICKHOUSE_DATABASE
),
)
# external tables
sendEmptyBlock(s)
@ -213,25 +225,27 @@ def insertValidLowCardinalityRow():
# Data
ba = bytearray()
writeVarUInt(2, ba) # Data
writeStringBinary('', ba)
writeVarUInt(2, ba) # Data
writeStringBinary("", ba)
serializeBlockInfo(ba)
writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns
writeStringBinary('x', ba)
writeStringBinary('LowCardinality(String)', ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend([3, 2] + [0] * 6) # indexes type: UInt64 [3], with additional keys [2]
ba.extend([1] + [0] * 7) # num_keys in dict
writeStringBinary('hello', ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 8) # UInt64 index (0 for 'hello')
writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns
writeStringBinary("x", ba)
writeStringBinary("LowCardinality(String)", ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend(
[3, 2] + [0] * 6
) # indexes type: UInt64 [3], with additional keys [2]
ba.extend([1] + [0] * 7) # num_keys in dict
writeStringBinary("hello", ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 8) # UInt64 index (0 for 'hello')
s.sendall(ba)
# Fin block
sendEmptyBlock(s)
assertPacket(readVarUInt(s), 5) # End of stream
assertPacket(readVarUInt(s), 5) # End of stream
s.close()
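# Illustrative helper (not used by the tests) that packs the same LowCardinality
# wire layout the bytes above build by hand: serialization version, index-type
# flags (UInt64 indexes = 3, "has additional keys" bit = 1 << 9), the dictionary
# keys, and the UInt64 indexes into them. The constants are read off the literal
# bytes written above.
import struct

def low_cardinality_payload(keys, indexes):
    ba = bytearray()
    ba += struct.pack("<Q", 1)  # SharedDictionariesWithAdditionalKeys
    ba += struct.pack("<Q", 3 | (1 << 9))  # indexes type UInt64, with additional keys
    ba += struct.pack("<Q", len(keys))  # num_keys in dict
    for key in keys:
        ba += bytes([len(key)]) + key.encode()  # short strings: 1-byte length + bytes
    ba += struct.pack("<Q", len(indexes))  # num_indexes
    for idx in indexes:
        ba += struct.pack("<Q", idx)  # UInt64 index into keys
    return bytes(ba)

# Reproduces the column body of the valid row sent above: one key 'hello', index 0.
assert low_cardinality_payload(["hello"], [0]) == bytes(
    [1] + [0] * 7 + [3, 2] + [0] * 6 + [1] + [0] * 7
    + [5] + list(b"hello") + [1] + [0] * 7 + [0] * 8
)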
@ -241,7 +255,12 @@ def insertLowCardinalityRowWithIndexOverflow():
s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT))
sendHello(s)
receiveHello(s)
sendQuery(s, 'insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV'.format(CLICKHOUSE_DATABASE))
sendQuery(
s,
"insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV".format(
CLICKHOUSE_DATABASE
),
)
# external tables
sendEmptyBlock(s)
@ -249,19 +268,21 @@ def insertLowCardinalityRowWithIndexOverflow():
# Data
ba = bytearray()
writeVarUInt(2, ba) # Data
writeStringBinary('', ba)
writeVarUInt(2, ba) # Data
writeStringBinary("", ba)
serializeBlockInfo(ba)
writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns
writeStringBinary('x', ba)
writeStringBinary('LowCardinality(String)', ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend([3, 2] + [0] * 6) # indexes type: UInt64 [3], with additional keys [2]
ba.extend([1] + [0] * 7) # num_keys in dict
writeStringBinary('hello', ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 7 + [1]) # UInt64 index (overflow)
writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns
writeStringBinary("x", ba)
writeStringBinary("LowCardinality(String)", ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend(
[3, 2] + [0] * 6
) # indexes type: UInt64 [3], with additional keys [2]
ba.extend([1] + [0] * 7) # num_keys in dict
writeStringBinary("hello", ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 7 + [1]) # UInt64 index (overflow)
s.sendall(ba)
assertPacket(readVarUInt(s), 2)
@ -275,7 +296,12 @@ def insertLowCardinalityRowWithIncorrectDictType():
s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT))
sendHello(s)
receiveHello(s)
sendQuery(s, 'insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV'.format(CLICKHOUSE_DATABASE))
sendQuery(
s,
"insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV".format(
CLICKHOUSE_DATABASE
),
)
# external tables
sendEmptyBlock(s)
@ -283,32 +309,40 @@ def insertLowCardinalityRowWithIncorrectDictType():
# Data
ba = bytearray()
writeVarUInt(2, ba) # Data
writeStringBinary('', ba)
writeVarUInt(2, ba) # Data
writeStringBinary("", ba)
serializeBlockInfo(ba)
writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns
writeStringBinary('x', ba)
writeStringBinary('LowCardinality(String)', ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend([3, 3] + [0] * 6) # indexes type: UInt64 [3], with global dict and add keys [1 + 2]
ba.extend([1] + [0] * 7) # num_keys in dict
writeStringBinary('hello', ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 8) # UInt64 index (overflow)
writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns
writeStringBinary("x", ba)
writeStringBinary("LowCardinality(String)", ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend(
[3, 3] + [0] * 6
) # indexes type: UInt64 [3], with global dict and add keys [1 + 2]
ba.extend([1] + [0] * 7) # num_keys in dict
writeStringBinary("hello", ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 8) # UInt64 index (overflow)
s.sendall(ba)
assertPacket(readVarUInt(s), 2)
print(readException(s))
s.close()
def insertLowCardinalityRowWithIncorrectAdditionalKeys():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(30)
s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT))
sendHello(s)
receiveHello(s)
sendQuery(s, 'insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV'.format(CLICKHOUSE_DATABASE))
sendQuery(
s,
"insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV".format(
CLICKHOUSE_DATABASE
),
)
# external tables
sendEmptyBlock(s)
@ -316,30 +350,34 @@ def insertLowCardinalityRowWithIncorrectAdditionalKeys():
# Data
ba = bytearray()
writeVarUInt(2, ba) # Data
writeStringBinary('', ba)
writeVarUInt(2, ba) # Data
writeStringBinary("", ba)
serializeBlockInfo(ba)
writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns
writeStringBinary('x', ba)
writeStringBinary('LowCardinality(String)', ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend([3, 0] + [0] * 6) # indexes type: UInt64 [3], with NO additional keys [0]
ba.extend([1] + [0] * 7) # num_keys in dict
writeStringBinary('hello', ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 8) # UInt64 index (0 for 'hello')
writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns
writeStringBinary("x", ba)
writeStringBinary("LowCardinality(String)", ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend(
[3, 0] + [0] * 6
) # indexes type: UInt64 [3], with NO additional keys [0]
ba.extend([1] + [0] * 7) # num_keys in dict
writeStringBinary("hello", ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 8) # UInt64 index (0 for 'hello')
s.sendall(ba)
assertPacket(readVarUInt(s), 2)
print(readException(s))
s.close()
def main():
insertValidLowCardinalityRow()
insertLowCardinalityRowWithIndexOverflow()
insertLowCardinalityRowWithIncorrectDictType()
insertLowCardinalityRowWithIncorrectAdditionalKeys()
if __name__ == "__main__":
main()

View File

@ -12,6 +12,7 @@ import subprocess
from io import StringIO
from http.server import BaseHTTPRequestHandler, HTTPServer
def is_ipv6(host):
try:
socket.inet_aton(host)
@ -19,6 +20,7 @@ def is_ipv6(host):
except:
return True
def get_local_port(host, ipv6):
if ipv6:
family = socket.AF_INET6
@ -29,8 +31,9 @@ def get_local_port(host, ipv6):
fd.bind((host, 0))
return fd.getsockname()[1]
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "127.0.0.1")
CLICKHOUSE_PORT_HTTP = os.environ.get("CLICKHOUSE_PORT_HTTP", "8123")
#####################################################################################
# This test starts an HTTP server and serves data to a ClickHouse URL-engine-based table.
@ -39,27 +42,42 @@ CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
#####################################################################################
# IP address of this host accessible from the outside world. Get the first one
HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0]
HTTP_SERVER_HOST = (
subprocess.check_output(["hostname", "-i"]).decode("utf-8").strip().split()[0]
)
IS_IPV6 = is_ipv6(HTTP_SERVER_HOST)
HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6)
# IP address and port of the HTTP server started from this script.
HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT)
if IS_IPV6:
HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
HTTP_SERVER_URL_STR = (
"http://"
+ f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/"
)
else:
HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
HTTP_SERVER_URL_STR = (
"http://" + f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}" + "/"
)
CSV_DATA = os.path.join(
tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())
)
CSV_DATA = os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()))
def get_ch_answer(query):
host = CLICKHOUSE_HOST
if IS_IPV6:
host = f'[{host}]'
host = f"[{host}]"
url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP))
url = os.environ.get(
"CLICKHOUSE_URL",
"http://{host}:{port}".format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP),
)
return urllib.request.urlopen(url, data=query.encode()).read().decode()
def check_answers(query, answer):
ch_answer = get_ch_answer(query)
if ch_answer.strip() != answer.strip():
@ -68,15 +86,16 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer, file=sys.stderr)
raise Exception("Fail on query")
class CSVHTTPServer(BaseHTTPRequestHandler):
def _set_headers(self):
self.send_response(200)
self.send_header('Content-type', 'text/csv')
self.send_header("Content-type", "text/csv")
self.end_headers()
def do_GET(self):
self._set_headers()
self.wfile.write(('hello, world').encode())
self.wfile.write(("hello, world").encode())
# with open(CSV_DATA, 'r') as fl:
# reader = csv.reader(fl, delimiter=',')
# for row in reader:
@ -84,33 +103,33 @@ class CSVHTTPServer(BaseHTTPRequestHandler):
return
def read_chunk(self):
msg = ''
msg = ""
while True:
sym = self.rfile.read(1)
if sym == '':
if sym == "":
break
msg += sym.decode('utf-8')
if msg.endswith('\r\n'):
msg += sym.decode("utf-8")
if msg.endswith("\r\n"):
break
length = int(msg[:-2], 16)
if length == 0:
return ''
return ""
content = self.rfile.read(length)
self.rfile.read(2) # read sep \r\n
return content.decode('utf-8')
self.rfile.read(2) # read sep \r\n
return content.decode("utf-8")
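# Self-contained sketch of the framing read_chunk() above parses -- HTTP chunked
# transfer encoding: a hex chunk-size line, CRLF, the payload, CRLF, terminated by
# a zero-length chunk.
import io

def decode_chunked(stream):
    body = ""
    while True:
        length = int(stream.readline().rstrip(b"\r\n"), 16)
        if length == 0:
            return body
        body += stream.read(length).decode("utf-8")
        stream.read(2)  # consume the trailing \r\n

assert decode_chunked(io.BytesIO(b"6\r\nhello,\r\n6\r\n world\r\n0\r\n\r\n")) == "hello, world"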
def do_POST(self):
data = ''
data = ""
while True:
chunk = self.read_chunk()
if not chunk:
break
data += chunk
with StringIO(data) as fl:
reader = csv.reader(fl, delimiter=',')
with open(CSV_DATA, 'a') as d:
reader = csv.reader(fl, delimiter=",")
with open(CSV_DATA, "a") as d:
for row in reader:
d.write(','.join(row) + '\n')
d.write(",".join(row) + "\n")
self._set_headers()
self.wfile.write(b"ok")
@ -121,6 +140,7 @@ class CSVHTTPServer(BaseHTTPRequestHandler):
class HTTPServerV6(HTTPServer):
address_family = socket.AF_INET6
def start_server():
if IS_IPV6:
httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, CSVHTTPServer)
@ -130,57 +150,87 @@ def start_server():
t = threading.Thread(target=httpd.serve_forever)
return t, httpd
# test section
def test_select(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests=[], answers=[], test_data=""):
with open(CSV_DATA, 'w') as f: # clear file
f.write('')
def test_select(
table_name="",
schema="str String,numuint UInt32,numint Int32,double Float64",
requests=[],
answers=[],
test_data="",
):
with open(CSV_DATA, "w") as f: # clear file
f.write("")
if test_data:
with open(CSV_DATA, 'w') as f:
with open(CSV_DATA, "w") as f:
f.write(test_data + "\n")
if table_name:
get_ch_answer("drop table if exists {}".format(table_name))
get_ch_answer("create table {} ({}) engine=URL('{}', 'CSV')".format(table_name, schema, HTTP_SERVER_URL_STR))
get_ch_answer(
"create table {} ({}) engine=URL('{}', 'CSV')".format(
table_name, schema, HTTP_SERVER_URL_STR
)
)
for i in range(len(requests)):
tbl = table_name
if not tbl:
tbl = "url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema)
tbl = "url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
check_answers(requests[i].format(tbl=tbl), answers[i])
if table_name:
get_ch_answer("drop table if exists {}".format(table_name))
def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests_insert=[], requests_select=[], answers=[]):
with open(CSV_DATA, 'w') as f: # flush test file
f.write('')
def test_insert(
table_name="",
schema="str String,numuint UInt32,numint Int32,double Float64",
requests_insert=[],
requests_select=[],
answers=[],
):
with open(CSV_DATA, "w") as f: # flush test file
f.write("")
if table_name:
get_ch_answer("drop table if exists {}".format(table_name))
get_ch_answer("create table {} ({}) engine=URL('{}', 'CSV')".format(table_name, schema, HTTP_SERVER_URL_STR))
get_ch_answer(
"create table {} ({}) engine=URL('{}', 'CSV')".format(
table_name, schema, HTTP_SERVER_URL_STR
)
)
for req in requests_insert:
tbl = table_name
if not tbl:
tbl = "table function url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema)
tbl = "table function url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
get_ch_answer(req.format(tbl=tbl))
for i in range(len(requests_select)):
tbl = table_name
if not tbl:
tbl = "url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema)
tbl = "url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
check_answers(requests_select[i].format(tbl=tbl), answers[i])
if table_name:
get_ch_answer("drop table if exists {}".format(table_name))
def test_select_url_engine(requests=[], answers=[], test_data=""):
for i in range(len(requests)):
check_answers(requests[i], answers[i])
def main():
test_data = "Hello,2,-2,7.7\nWorld,2,-5,8.8"
"""
@ -203,19 +253,29 @@ def main():
"""
if IS_IPV6:
query = "select * from url('http://guest:guest@" + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/', 'RawBLOB', 'a String')"
query = (
"select * from url('http://guest:guest@"
+ f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/', 'RawBLOB', 'a String')"
)
else:
query = "select * from url('http://guest:guest@" + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/', 'RawBLOB', 'a String')"
query = (
"select * from url('http://guest:guest@"
+ f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/', 'RawBLOB', 'a String')"
)
select_requests_url_auth = {
query : 'hello, world',
query: "hello, world",
}
t, httpd = start_server()
t.start()
test_select(requests=list(select_requests_url_auth.keys()), answers=list(select_requests_url_auth.values()), test_data=test_data)
test_select(
requests=list(select_requests_url_auth.keys()),
answers=list(select_requests_url_auth.values()),
test_data=test_data,
)
httpd.shutdown()
t.join()
print("PASSED")

View File

@ -8,7 +8,7 @@ import pandas as pd
import numpy as np
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
@ -25,7 +25,7 @@ def twosample_proportion_ztest(s1, s2, t1, t2, alpha):
return nan, nan, nan, nan
z_stat = (p1 - p2) / se
one_side = 1 - stats.norm.cdf(abs(z_stat))
one_side = 1 - stats.norm.cdf(abs(z_stat))
p_value = one_side * 2
z = stats.norm.ppf(1 - 0.5 * alpha)
@ -38,71 +38,171 @@ def twosample_proportion_ztest(s1, s2, t1, t2, alpha):
def test_and_check(name, z_stat, p_value, ci_lower, ci_upper, precision=1e-2):
client = ClickHouseClient()
real = client.query_return_df(
"SELECT roundBankers({}.1, 16) as z_stat, ".format(name) +
"roundBankers({}.2, 16) as p_value, ".format(name) +
"roundBankers({}.3, 16) as ci_lower, ".format(name) +
"roundBankers({}.4, 16) as ci_upper ".format(name) +
"FORMAT TabSeparatedWithNames;")
real_z_stat = real['z_stat'][0]
real_p_value = real['p_value'][0]
real_ci_lower = real['ci_lower'][0]
real_ci_upper = real['ci_upper'][0]
assert((np.isnan(real_z_stat) and np.isnan(z_stat)) or abs(real_z_stat - np.float64(z_stat)) < precision), "clickhouse_z_stat {}, py_z_stat {}".format(real_z_stat, z_stat)
assert((np.isnan(real_p_value) and np.isnan(p_value)) or abs(real_p_value - np.float64(p_value)) < precision), "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value)
assert((np.isnan(real_ci_lower) and np.isnan(ci_lower)) or abs(real_ci_lower - np.float64(ci_lower)) < precision), "clickhouse_ci_lower {}, py_ci_lower {}".format(real_ci_lower, ci_lower)
assert((np.isnan(real_ci_upper) and np.isnan(ci_upper)) or abs(real_ci_upper - np.float64(ci_upper)) < precision), "clickhouse_ci_upper {}, py_ci_upper {}".format(real_ci_upper, ci_upper)
"SELECT roundBankers({}.1, 16) as z_stat, ".format(name)
+ "roundBankers({}.2, 16) as p_value, ".format(name)
+ "roundBankers({}.3, 16) as ci_lower, ".format(name)
+ "roundBankers({}.4, 16) as ci_upper ".format(name)
+ "FORMAT TabSeparatedWithNames;"
)
real_z_stat = real["z_stat"][0]
real_p_value = real["p_value"][0]
real_ci_lower = real["ci_lower"][0]
real_ci_upper = real["ci_upper"][0]
assert (np.isnan(real_z_stat) and np.isnan(z_stat)) or abs(
real_z_stat - np.float64(z_stat)
) < precision, "clickhouse_z_stat {}, py_z_stat {}".format(real_z_stat, z_stat)
assert (np.isnan(real_p_value) and np.isnan(p_value)) or abs(
real_p_value - np.float64(p_value)
) < precision, "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value)
assert (np.isnan(real_ci_lower) and np.isnan(ci_lower)) or abs(
real_ci_lower - np.float64(ci_lower)
) < precision, "clickhouse_ci_lower {}, py_ci_lower {}".format(
real_ci_lower, ci_lower
)
assert (np.isnan(real_ci_upper) and np.isnan(ci_upper)) or abs(
real_ci_upper - np.float64(ci_upper)
) < precision, "clickhouse_ci_upper {}, py_ci_upper {}".format(
real_ci_upper, ci_upper
)
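# Hedged restatement of the Python reference the assertions above compare against.
# The body of twosample_proportion_ztest is not fully shown in this hunk, so the
# unpooled standard error below is an assumption, consistent with the 'unpooled'
# argument passed to proportionsZTest.
from math import sqrt
from scipy import stats

def unpooled_proportion_ztest(s1, s2, t1, t2, alpha=0.05):
    p1, p2 = s1 / t1, s2 / t2
    se = sqrt(p1 * (1 - p1) / t1 + p2 * (1 - p2) / t2)
    z_stat = (p1 - p2) / se
    p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
    z = stats.norm.ppf(1 - 0.5 * alpha)
    return z_stat, p_value, (p1 - p2) - z * se, (p1 - p2) + z * se

# e.g. one of the fixed cases below: 10 successes of 30 vs 20 successes of 40
# print(unpooled_proportion_ztest(10, 20, 30, 40))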
def test_mean_ztest():
counts = [0, 0]
nobs = [0, 0]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05)
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(10, 10, 10, 10, 0.05)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
10, 10, 10, 10, 0.05
)
counts = [10, 10]
nobs = [10, 10]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05)
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(10, 10, 10, 10, 0.05)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
10, 10, 10, 10, 0.05
)
counts = [16, 16]
nobs = [16, 18]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05)
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [10, 20]
nobs = [30, 40]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05)
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [20, 10]
nobs = [40, 30]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05)
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [randrange(10,20), randrange(10,20)]
nobs = [randrange(counts[0] + 1, counts[0] * 2), randrange(counts[1], counts[1] * 2)]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05)
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper)
counts = [randrange(10, 20), randrange(10, 20)]
nobs = [
randrange(counts[0] + 1, counts[0] * 2),
randrange(counts[1], counts[1] * 2),
]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [randrange(1,100), randrange(1,200)]
counts = [randrange(1, 100), randrange(1, 200)]
nobs = [randrange(counts[0], counts[0] * 2), randrange(counts[1], counts[1] * 3)]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05)
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [randrange(1,200), randrange(1,100)]
counts = [randrange(1, 200), randrange(1, 100)]
nobs = [randrange(counts[0], counts[0] * 3), randrange(counts[1], counts[1] * 2)]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05)
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [randrange(1,1000), randrange(1,1000)]
counts = [randrange(1, 1000), randrange(1, 1000)]
nobs = [randrange(counts[0], counts[0] * 2), randrange(counts[1], counts[1] * 2)]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05)
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
if __name__ == "__main__":
test_mean_ztest()
print("Ok.")

View File

@ -7,7 +7,7 @@ import pandas as pd
import numpy as np
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
@ -30,46 +30,95 @@ def twosample_mean_ztest(rvs1, rvs2, alpha=0.05):
def test_and_check(name, a, b, t_stat, p_value, ci_low, ci_high, precision=1e-2):
client = ClickHouseClient()
client.query("DROP TABLE IF EXISTS ztest;")
client.query("CREATE TABLE ztest (left Float64, right UInt8) ENGINE = Memory;");
client.query("INSERT INTO ztest VALUES {};".format(", ".join(['({},{})'.format(i, 0) for i in a])))
client.query("INSERT INTO ztest VALUES {};".format(", ".join(['({},{})'.format(j, 1) for j in b])))
client.query("CREATE TABLE ztest (left Float64, right UInt8) ENGINE = Memory;")
client.query(
"INSERT INTO ztest VALUES {};".format(
", ".join(["({},{})".format(i, 0) for i in a])
)
)
client.query(
"INSERT INTO ztest VALUES {};".format(
", ".join(["({},{})".format(j, 1) for j in b])
)
)
real = client.query_return_df(
"SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name) +
"roundBankers({}(left, right).2, 16) as p_value, ".format(name) +
"roundBankers({}(left, right).3, 16) as ci_low, ".format(name) +
"roundBankers({}(left, right).4, 16) as ci_high ".format(name) +
"FROM ztest FORMAT TabSeparatedWithNames;")
real_t_stat = real['t_stat'][0]
real_p_value = real['p_value'][0]
real_ci_low = real['ci_low'][0]
real_ci_high = real['ci_high'][0]
assert(abs(real_t_stat - np.float64(t_stat)) < precision), "clickhouse_t_stat {}, py_t_stat {}".format(real_t_stat, t_stat)
assert(abs(real_p_value - np.float64(p_value)) < precision), "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value)
assert(abs(real_ci_low - np.float64(ci_low)) < precision), "clickhouse_ci_low {}, py_ci_low {}".format(real_ci_low, ci_low)
assert(abs(real_ci_high - np.float64(ci_high)) < precision), "clickhouse_ci_high {}, py_ci_high {}".format(real_ci_high, ci_high)
"SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name)
+ "roundBankers({}(left, right).2, 16) as p_value, ".format(name)
+ "roundBankers({}(left, right).3, 16) as ci_low, ".format(name)
+ "roundBankers({}(left, right).4, 16) as ci_high ".format(name)
+ "FROM ztest FORMAT TabSeparatedWithNames;"
)
real_t_stat = real["t_stat"][0]
real_p_value = real["p_value"][0]
real_ci_low = real["ci_low"][0]
real_ci_high = real["ci_high"][0]
assert (
abs(real_t_stat - np.float64(t_stat)) < precision
), "clickhouse_t_stat {}, py_t_stat {}".format(real_t_stat, t_stat)
assert (
abs(real_p_value - np.float64(p_value)) < precision
), "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value)
assert (
abs(real_ci_low - np.float64(ci_low)) < precision
), "clickhouse_ci_low {}, py_ci_low {}".format(real_ci_low, ci_low)
assert (
abs(real_ci_high - np.float64(ci_high)) < precision
), "clickhouse_ci_high {}, py_ci_high {}".format(real_ci_high, ci_high)
client.query("DROP TABLE IF EXISTS ztest;")
def test_mean_ztest():
rvs1 = np.round(stats.norm.rvs(loc=1, scale=5,size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=10, scale=5,size=500), 2)
rvs1 = np.round(stats.norm.rvs(loc=1, scale=5, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=10, scale=5, size=500), 2)
s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2)
test_and_check("meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)), rvs1, rvs2, s, p, cl, ch)
test_and_check(
"meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)),
rvs1,
rvs2,
s,
p,
cl,
ch,
)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=5,size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=5,size=500), 2)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=5, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=5, size=500), 2)
s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2)
test_and_check("meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)), rvs1, rvs2, s, p, cl, ch)
test_and_check(
"meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)),
rvs1,
rvs2,
s,
p,
cl,
ch,
)
rvs1 = np.round(stats.norm.rvs(loc=2, scale=10,size=512), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=20,size=1024), 2)
rvs1 = np.round(stats.norm.rvs(loc=2, scale=10, size=512), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=20, size=1024), 2)
s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2)
test_and_check("meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)), rvs1, rvs2, s, p, cl, ch)
test_and_check(
"meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)),
rvs1,
rvs2,
s,
p,
cl,
ch,
)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=10,size=1024), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=10,size=512), 2)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=10, size=1024), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=10, size=512), 2)
s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2)
test_and_check("meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)), rvs1, rvs2, s, p, cl, ch)
test_and_check(
"meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)),
rvs1,
rvs2,
s,
p,
cl,
ch,
)
if __name__ == "__main__":

View File

@ -3,47 +3,71 @@ import os
import sys
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
CLICKHOUSE_URL = os.environ.get('CLICKHOUSE_URL')
CLICKHOUSE_TMP = os.environ.get('CLICKHOUSE_TMP')
CLICKHOUSE_URL = os.environ.get("CLICKHOUSE_URL")
CLICKHOUSE_TMP = os.environ.get("CLICKHOUSE_TMP")
from pure_http_client import ClickHouseClient
client = ClickHouseClient()
def run_test(data_format, gen_data_template, settings):
print(data_format)
client.query("TRUNCATE TABLE t_async_insert")
expected = client.query(gen_data_template.format("TSV")).strip()
data = client.query(gen_data_template.format(data_format), settings=settings,binary_result=True)
data = client.query(
gen_data_template.format(data_format), settings=settings, binary_result=True
)
insert_query = "INSERT INTO t_async_insert FORMAT {}".format(data_format)
client.query_with_data(insert_query, data, settings=settings)
result = client.query("SELECT * FROM t_async_insert FORMAT TSV").strip()
if result != expected:
print("Failed for format {}.\nExpected:\n{}\nGot:\n{}\n".format(data_format, expected, result))
print(
"Failed for format {}.\nExpected:\n{}\nGot:\n{}\n".format(
data_format, expected, result
)
)
exit(1)
formats = client.query("SELECT name FROM system.formats WHERE is_input AND is_output \
AND name NOT IN ('CapnProto', 'RawBLOB', 'Template', 'ProtobufSingle', 'LineAsString', 'Protobuf', 'ProtobufList') ORDER BY name").strip().split('\n')
formats = (
client.query(
"SELECT name FROM system.formats WHERE is_input AND is_output \
AND name NOT IN ('CapnProto', 'RawBLOB', 'Template', 'ProtobufSingle', 'LineAsString', 'Protobuf', 'ProtobufList') ORDER BY name"
)
.strip()
.split("\n")
)
# Generic formats
client.query("DROP TABLE IF EXISTS t_async_insert")
client.query("CREATE TABLE t_async_insert (id UInt64, s String, arr Array(UInt64)) ENGINE = Memory")
client.query(
"CREATE TABLE t_async_insert (id UInt64, s String, arr Array(UInt64)) ENGINE = Memory"
)
gen_data_query = "SELECT number AS id, toString(number) AS s, range(number) AS arr FROM numbers(10) FORMAT {}"
for data_format in formats:
run_test(data_format, gen_data_query, settings={"async_insert": 1, "wait_for_async_insert": 1})
run_test(
data_format,
gen_data_query,
settings={"async_insert": 1, "wait_for_async_insert": 1},
)
# LineAsString
client.query("DROP TABLE IF EXISTS t_async_insert")
client.query("CREATE TABLE t_async_insert (s String) ENGINE = Memory")
gen_data_query = "SELECT toString(number) AS s FROM numbers(10) FORMAT {}"
run_test('LineAsString', gen_data_query, settings={"async_insert": 1, "wait_for_async_insert": 1})
run_test(
"LineAsString",
gen_data_query,
settings={"async_insert": 1, "wait_for_async_insert": 1},
)
# TODO: add CapnProto and Protobuf
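# Hedged sketch of the round-trip run_test() performs, issued against the raw HTTP
# interface that helpers/pure_http_client wraps. The endpoint, format and settings
# below are illustrative; the three-column t_async_insert table is the one created
# above for the generic formats.
import os
import requests

_url = os.environ.get("CLICKHOUSE_URL", "http://localhost:8123")
_data = requests.post(
    _url,
    params={
        "query": "SELECT number AS id, toString(number) AS s, range(number) AS arr "
        "FROM numbers(10) FORMAT JSONEachRow"
    },
).content
requests.post(
    _url,
    params={
        "query": "INSERT INTO t_async_insert FORMAT JSONEachRow",
        "async_insert": 1,
        "wait_for_async_insert": 1,
    },
    data=_data,
)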

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python3
from http.server import SimpleHTTPRequestHandler,HTTPServer
from http.server import SimpleHTTPRequestHandler, HTTPServer
import socket
import sys
import threading
@ -17,6 +17,7 @@ def is_ipv6(host):
except:
return True
def get_local_port(host, ipv6):
if ipv6:
family = socket.AF_INET6
@ -27,20 +28,19 @@ def get_local_port(host, ipv6):
fd.bind((host, 0))
return fd.getsockname()[1]
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', 'localhost')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "localhost")
CLICKHOUSE_PORT_HTTP = os.environ.get("CLICKHOUSE_PORT_HTTP", "8123")
# Server returns this JSON response.
SERVER_JSON_RESPONSE = \
'''{
SERVER_JSON_RESPONSE = """{
"login": "ClickHouse",
"id": 54801242,
"name": "ClickHouse",
"company": null
}'''
}"""
EXPECTED_ANSWER = \
'''{\\n\\t"login": "ClickHouse",\\n\\t"id": 54801242,\\n\\t"name": "ClickHouse",\\n\\t"company": null\\n}'''
EXPECTED_ANSWER = """{\\n\\t"login": "ClickHouse",\\n\\t"id": 54801242,\\n\\t"name": "ClickHouse",\\n\\t"company": null\\n}"""
#####################################################################################
# This test starts an HTTP server and serves data to a ClickHouse URL-engine-based table.
@ -51,26 +51,38 @@ EXPECTED_ANSWER = \
#####################################################################################
# IP address of this host accessible from the outside world. Get the first one
HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0]
HTTP_SERVER_HOST = (
subprocess.check_output(["hostname", "-i"]).decode("utf-8").strip().split()[0]
)
IS_IPV6 = is_ipv6(HTTP_SERVER_HOST)
HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6)
# IP address and port of the HTTP server started from this script.
HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT)
if IS_IPV6:
HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
HTTP_SERVER_URL_STR = (
"http://"
+ f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/"
)
else:
HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
HTTP_SERVER_URL_STR = (
"http://" + f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}" + "/"
)
def get_ch_answer(query):
host = CLICKHOUSE_HOST
if IS_IPV6:
host = f'[{host}]'
host = f"[{host}]"
url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP))
url = os.environ.get(
"CLICKHOUSE_URL",
"http://{host}:{port}".format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP),
)
return urllib.request.urlopen(url, data=query.encode()).read().decode()
def check_answers(query, answer):
ch_answer = get_ch_answer(query)
if ch_answer.strip() != answer.strip():
@ -79,16 +91,17 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer, file=sys.stderr)
raise Exception("Fail on query")
# Server that checks the User-Agent header.
class HttpProcessor(SimpleHTTPRequestHandler):
def _set_headers(self):
user_agent = self.headers.get('User-Agent')
if user_agent and user_agent.startswith('ClickHouse/'):
user_agent = self.headers.get("User-Agent")
if user_agent and user_agent.startswith("ClickHouse/"):
self.send_response(200)
else:
self.send_response(403)
self.send_header('Content-Type', 'text/csv')
self.send_header("Content-Type", "text/csv")
self.end_headers()
def do_GET(self):
@ -98,9 +111,11 @@ class HttpProcessor(SimpleHTTPRequestHandler):
def log_message(self, format, *args):
return
class HTTPServerV6(HTTPServer):
address_family = socket.AF_INET6
def start_server(requests_amount):
if IS_IPV6:
httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, HttpProcessor)
@ -114,15 +129,18 @@ def start_server(requests_amount):
t = threading.Thread(target=real_func)
return t
#####################################################################
# Testing area.
#####################################################################
def test_select():
global HTTP_SERVER_URL_STR
query = 'SELECT * FROM url(\'{}\',\'JSONAsString\');'.format(HTTP_SERVER_URL_STR)
query = "SELECT * FROM url('{}','JSONAsString');".format(HTTP_SERVER_URL_STR)
check_answers(query, EXPECTED_ANSWER)
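# Illustrative check of the behaviour the handler above enforces (assumes the
# server from start_server() is already running): requests that do not identify
# themselves as ClickHouse get 403, while a "ClickHouse/..." user agent gets 200.
import requests

_accepted = requests.get(HTTP_SERVER_URL_STR, headers={"User-Agent": "ClickHouse/23.3"})
_rejected = requests.get(HTTP_SERVER_URL_STR, headers={"User-Agent": "curl/7.68.0"})
assert _accepted.status_code == 200 and _rejected.status_code == 403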
def main():
# HEAD + GET
t = start_server(3)
@ -131,6 +149,7 @@ def main():
t.join()
print("PASSED")
if __name__ == "__main__":
try:
main()
@ -141,4 +160,3 @@ if __name__ == "__main__":
sys.stderr.flush()
os._exit(1)

View File

@ -122,7 +122,7 @@ class HttpProcessor(BaseHTTPRequestHandler):
get_call_num = 0
responses_to_get = []
def send_head(self, from_get = False):
def send_head(self, from_get=False):
if self.headers["Range"] and HttpProcessor.allow_range:
try:
self.range = parse_byte_range(self.headers["Range"])
@ -146,7 +146,9 @@ class HttpProcessor(BaseHTTPRequestHandler):
self.send_error(416, "Requested Range Not Satisfiable")
return None
retry_range_request = first != 0 and from_get is True and len(HttpProcessor.responses_to_get) > 0
retry_range_request = (
first != 0 and from_get is True and len(HttpProcessor.responses_to_get) > 0
)
if retry_range_request:
code = HttpProcessor.responses_to_get.pop()
if code not in HttpProcessor.responses:
@ -244,7 +246,9 @@ def run_test(allow_range, settings, check_retries=False):
raise Exception("HTTP Range was not used when supported")
if check_retries and len(HttpProcessor.responses_to_get) > 0:
raise Exception("Expected to get http response 500, which had to be retried, but 200 ok returned and then retried")
raise Exception(
"Expected to get http response 500, which had to be retried, but 200 ok returned and then retried"
)
if retries_num > 0:
expected_get_call_num += retries_num - 1
@ -263,7 +267,7 @@ def run_test(allow_range, settings, check_retries=False):
def main():
settings = {"max_download_buffer_size" : 20}
settings = {"max_download_buffer_size": 20}
# Test Accept-Ranges=False
run_test(allow_range=False, settings=settings)
@ -271,7 +275,7 @@ def main():
run_test(allow_range=True, settings=settings)
# Test Accept-Ranges=True, parallel download is used
settings = {"max_download_buffer_size" : 10}
settings = {"max_download_buffer_size": 10}
run_test(allow_range=True, settings=settings)
# Test Accept-Ranges=True, parallel download is not used,

View File

@ -7,7 +7,7 @@ import pandas as pd
import numpy as np
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
@ -22,15 +22,22 @@ def test_and_check(rvs, n_groups, f_stat, p_value, precision=1e-2):
client.query("DROP TABLE IF EXISTS anova;")
client.query("CREATE TABLE anova (left Float64, right UInt64) ENGINE = Memory;")
for group in range(n_groups):
client.query(f'''INSERT INTO anova VALUES {", ".join([f'({i},{group})' for i in rvs[group]])};''')
client.query(
f"""INSERT INTO anova VALUES {", ".join([f'({i},{group})' for i in rvs[group]])};"""
)
real = client.query_return_df(
'''SELECT roundBankers(a.1, 16) as f_stat, roundBankers(a.2, 16) as p_value FROM (SELECT anova(left, right) as a FROM anova) FORMAT TabSeparatedWithNames;''')
"""SELECT roundBankers(a.1, 16) as f_stat, roundBankers(a.2, 16) as p_value FROM (SELECT anova(left, right) as a FROM anova) FORMAT TabSeparatedWithNames;"""
)
real_f_stat = real['f_stat'][0]
real_p_value = real['p_value'][0]
assert(abs(real_f_stat - np.float64(f_stat)) < precision), f"clickhouse_f_stat {real_f_stat}, py_f_stat {f_stat}"
assert(abs(real_p_value - np.float64(p_value)) < precision), f"clickhouse_p_value {real_p_value}, py_p_value {p_value}"
real_f_stat = real["f_stat"][0]
real_p_value = real["p_value"][0]
assert (
abs(real_f_stat - np.float64(f_stat)) < precision
), f"clickhouse_f_stat {real_f_stat}, py_f_stat {f_stat}"
assert (
abs(real_p_value - np.float64(p_value)) < precision
), f"clickhouse_p_value {real_p_value}, py_p_value {p_value}"
client.query("DROP TABLE IF EXISTS anova;")

View File

@ -123,10 +123,14 @@ Uses FinishSortingTransform: {}
for query in queries:
check_query(query["where"], query["order_by"], query["optimize"], False)
check_query(query["where"], query["order_by"] + ["e"], query["optimize"], query["optimize"])
check_query(
query["where"], query["order_by"] + ["e"], query["optimize"], query["optimize"]
)
where_columns = [f"bitNot({col})" for col in query["where"]]
check_query(where_columns, query["order_by"], query["optimize"], False)
check_query(where_columns, query["order_by"] + ["e"], query["optimize"], query["optimize"])
check_query(
where_columns, query["order_by"] + ["e"], query["optimize"], query["optimize"]
)
print("OK")

View File

@ -8,8 +8,8 @@ TRANSFER_ENCODING_HEADER = "Transfer-Encoding"
def main():
host = os.environ['CLICKHOUSE_HOST']
port = int(os.environ['CLICKHOUSE_PORT_HTTP'])
host = os.environ["CLICKHOUSE_HOST"]
port = int(os.environ["CLICKHOUSE_PORT_HTTP"])
sock = socket(AF_INET, SOCK_STREAM)
sock.connect((host, port))
@ -47,4 +47,3 @@ def main():
if __name__ == "__main__":
main()

View File

@ -5,9 +5,10 @@ import os
import uuid
import json
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
CLICKHOUSE_PORT = int(os.environ.get('CLICKHOUSE_PORT_TCP', '900000'))
CLICKHOUSE_DATABASE = os.environ.get('CLICKHOUSE_DATABASE', 'default')
CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "127.0.0.1")
CLICKHOUSE_PORT = int(os.environ.get("CLICKHOUSE_PORT_TCP", "900000"))
CLICKHOUSE_DATABASE = os.environ.get("CLICKHOUSE_DATABASE", "default")
def writeVarUInt(x, ba):
for _ in range(0, 9):
@ -24,12 +25,12 @@ def writeVarUInt(x, ba):
def writeStringBinary(s, ba):
b = bytes(s, 'utf-8')
b = bytes(s, "utf-8")
writeVarUInt(len(s), ba)
ba.extend(b)
def readStrict(s, size = 1):
def readStrict(s, size=1):
res = bytearray()
while size:
cur = s.recv(size)
@ -48,18 +49,23 @@ def readUInt(s, size=1):
val += res[i] << (i * 8)
return val
def readUInt8(s):
return readUInt(s)
def readUInt16(s):
return readUInt(s, 2)
def readUInt32(s):
return readUInt(s, 4)
def readUInt64(s):
return readUInt(s, 8)
def readVarUInt(s):
x = 0
for i in range(9):
@ -75,25 +81,25 @@ def readVarUInt(s):
def readStringBinary(s):
size = readVarUInt(s)
s = readStrict(s, size)
return s.decode('utf-8')
return s.decode("utf-8")
def sendHello(s):
ba = bytearray()
writeVarUInt(0, ba) # Hello
writeStringBinary('simple native protocol', ba)
writeVarUInt(0, ba) # Hello
writeStringBinary("simple native protocol", ba)
writeVarUInt(21, ba)
writeVarUInt(9, ba)
writeVarUInt(54449, ba)
writeStringBinary(CLICKHOUSE_DATABASE, ba) # database
writeStringBinary('default', ba) # user
writeStringBinary('', ba) # pwd
writeStringBinary(CLICKHOUSE_DATABASE, ba) # database
writeStringBinary("default", ba) # user
writeStringBinary("", ba) # pwd
s.sendall(ba)
def receiveHello(s):
p_type = readVarUInt(s)
assert (p_type == 0) # Hello
assert p_type == 0 # Hello
server_name = readStringBinary(s)
# print("Server name: ", server_name)
server_version_major = readVarUInt(s)
@ -111,65 +117,65 @@ def receiveHello(s):
def serializeClientInfo(ba, query_id):
writeStringBinary('default', ba) # initial_user
writeStringBinary(query_id, ba) # initial_query_id
writeStringBinary('127.0.0.1:9000', ba) # initial_address
ba.extend([0] * 8) # initial_query_start_time_microseconds
ba.append(1) # TCP
writeStringBinary('os_user', ba) # os_user
writeStringBinary('client_hostname', ba) # client_hostname
writeStringBinary('client_name', ba) # client_name
writeStringBinary("default", ba) # initial_user
writeStringBinary(query_id, ba) # initial_query_id
writeStringBinary("127.0.0.1:9000", ba) # initial_address
ba.extend([0] * 8) # initial_query_start_time_microseconds
ba.append(1) # TCP
writeStringBinary("os_user", ba) # os_user
writeStringBinary("client_hostname", ba) # client_hostname
writeStringBinary("client_name", ba) # client_name
writeVarUInt(21, ba)
writeVarUInt(9, ba)
writeVarUInt(54449, ba)
writeStringBinary('', ba) # quota_key
writeVarUInt(0, ba) # distributed_depth
writeVarUInt(1, ba) # client_version_patch
ba.append(0) # No telemetry
writeStringBinary("", ba) # quota_key
writeVarUInt(0, ba) # distributed_depth
writeVarUInt(1, ba) # client_version_patch
ba.append(0) # No telemetry
def sendQuery(s, query):
ba = bytearray()
query_id = uuid.uuid4().hex
writeVarUInt(1, ba) # query
writeVarUInt(1, ba) # query
writeStringBinary(query_id, ba)
ba.append(1) # INITIAL_QUERY
ba.append(1) # INITIAL_QUERY
# client info
serializeClientInfo(ba, query_id)
writeStringBinary('', ba) # No settings
writeStringBinary('', ba) # No interserver secret
writeVarUInt(2, ba) # Stage - Complete
ba.append(0) # No compression
writeStringBinary(query, ba) # query, finally
writeStringBinary("", ba) # No settings
writeStringBinary("", ba) # No interserver secret
writeVarUInt(2, ba) # Stage - Complete
ba.append(0) # No compression
writeStringBinary(query, ba) # query, finally
s.sendall(ba)
def serializeBlockInfo(ba):
writeVarUInt(1, ba) # 1
ba.append(0) # is_overflows
writeVarUInt(2, ba) # 2
writeVarUInt(0, ba) # 0
ba.extend([0] * 4) # bucket_num
writeVarUInt(1, ba) # 1
ba.append(0) # is_overflows
writeVarUInt(2, ba) # 2
writeVarUInt(0, ba) # 0
ba.extend([0] * 4) # bucket_num
def sendEmptyBlock(s):
ba = bytearray()
writeVarUInt(2, ba) # Data
writeStringBinary('', ba)
writeVarUInt(2, ba) # Data
writeStringBinary("", ba)
serializeBlockInfo(ba)
writeVarUInt(0, ba) # rows
writeVarUInt(0, ba) # columns
writeVarUInt(0, ba) # rows
writeVarUInt(0, ba) # columns
s.sendall(ba)
def assertPacket(packet, expected):
assert(packet == expected), packet
assert packet == expected, packet
class Progress():
class Progress:
def __init__(self):
# NOTE: this is done in ctor to initialize __dict__
self.read_rows = 0
@ -198,11 +204,12 @@ class Progress():
def __bool__(self):
return (
self.read_rows > 0 or
self.read_bytes > 0 or
self.total_rows_to_read > 0 or
self.written_rows > 0 or
self.written_bytes > 0)
self.read_rows > 0
or self.read_bytes > 0
or self.total_rows_to_read > 0
or self.written_rows > 0
or self.written_bytes > 0
)
def readProgress(s):
@ -219,13 +226,14 @@ def readProgress(s):
progress.readPacket(s)
return progress
def readException(s):
code = readUInt32(s)
name = readStringBinary(s)
text = readStringBinary(s)
readStringBinary(s) # trace
assertPacket(readUInt8(s), 0) # has_nested
return "code {}: {}".format(code, text.replace('DB::Exception:', ''))
readStringBinary(s) # trace
assertPacket(readUInt8(s), 0) # has_nested
return "code {}: {}".format(code, text.replace("DB::Exception:", ""))
def main():
@ -236,7 +244,10 @@ def main():
receiveHello(s)
# With a 1-second sleep and 1000ms of interactive_delay we should definitely get a non-zero progress packet.
# NOTE: interactive_delay=0 cannot be used since in this case CompletedPipelineExecutor will not call the cancelled callback.
sendQuery(s, "insert into function null('_ Int') select sleep(1) from numbers(2) settings max_block_size=1, interactive_delay=1000")
sendQuery(
s,
"insert into function null('_ Int') select sleep(1) from numbers(2) settings max_block_size=1, interactive_delay=1000",
)
# external tables
sendEmptyBlock(s)

View File

@ -4,18 +4,19 @@ import os
import sys
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
class Tester:
'''
"""
- Creates test table
- Deletes the specified range of rows
- Masks another range using row-level policy
- Runs some read queries and checks that the results are as expected
'''
"""
def __init__(self, session, url, index_granularity, total_rows):
self.session = session
self.url = url
@ -25,10 +26,10 @@ class Tester:
self.repro_queries = []
def report_error(self):
print('Repro steps:', '\n\n\t'.join(self.repro_queries))
print("Repro steps:", "\n\n\t".join(self.repro_queries))
exit(1)
def query(self, query_text, include_in_repro_steps = True, expected_data = None):
def query(self, query_text, include_in_repro_steps=True, expected_data=None):
self.repro_queries.append(query_text)
resp = self.session.post(self.url, data=query_text)
if resp.status_code != 200:
@ -36,113 +37,187 @@ class Tester:
error = resp.text[0:40]
if error not in self.reported_errors:
self.reported_errors.add(error)
print('Code:', resp.status_code)
print('Result:', resp.text)
print("Code:", resp.status_code)
print("Result:", resp.text)
self.report_error()
result = resp.text
# Check that the result is as expected
if ((not expected_data is None) and (int(result) != len(expected_data))):
print('Expected {} rows, got {}'.format(len(expected_data), result))
print('Expected data:' + str(expected_data))
if (not expected_data is None) and (int(result) != len(expected_data)):
print("Expected {} rows, got {}".format(len(expected_data), result))
print("Expected data:" + str(expected_data))
self.report_error()
if not include_in_repro_steps:
self.repro_queries.pop()
def check_data(self, all_data, delete_range_start, delete_range_end, row_level_policy_range_start, row_level_policy_range_end):
def check_data(
self,
all_data,
delete_range_start,
delete_range_end,
row_level_policy_range_start,
row_level_policy_range_end,
):
all_data_after_delete = all_data[
~((all_data.a == 0) &
(all_data.b > delete_range_start) &
(all_data.b <= delete_range_end))]
~(
(all_data.a == 0)
& (all_data.b > delete_range_start)
& (all_data.b <= delete_range_end)
)
]
all_data_after_row_policy = all_data_after_delete[
(all_data_after_delete.b <= row_level_policy_range_start) |
(all_data_after_delete.b > row_level_policy_range_end)]
(all_data_after_delete.b <= row_level_policy_range_start)
| (all_data_after_delete.b > row_level_policy_range_end)
]
for to_select in ['count()', 'sum(d)']: # Test reading with and without column with default value
self.query('SELECT {} FROM tab_02473;'.format(to_select), False, all_data_after_row_policy)
for to_select in [
"count()",
"sum(d)",
]: # Test reading with and without column with default value
self.query(
"SELECT {} FROM tab_02473;".format(to_select),
False,
all_data_after_row_policy,
)
delta = 10
for query_range_start in [0, delta]:
for query_range_end in [self.total_rows - delta]: #, self.total_rows]:
for query_range_end in [self.total_rows - delta]: # , self.total_rows]:
expected = all_data_after_row_policy[
(all_data_after_row_policy.a == 0) &
(all_data_after_row_policy.b > query_range_start) &
(all_data_after_row_policy.b <= query_range_end)]
self.query('SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} WHERE a == 0;'.format(
to_select, query_range_start, query_range_end), False, expected)
(all_data_after_row_policy.a == 0)
& (all_data_after_row_policy.b > query_range_start)
& (all_data_after_row_policy.b <= query_range_end)
]
self.query(
"SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} WHERE a == 0;".format(
to_select, query_range_start, query_range_end
),
False,
expected,
)
expected = all_data_after_row_policy[
(all_data_after_row_policy.a == 0) &
(all_data_after_row_policy.c > query_range_start) &
(all_data_after_row_policy.c <= query_range_end)]
self.query('SELECT {} from tab_02473 PREWHERE c > {} AND c <= {} WHERE a == 0;'.format(
to_select, query_range_start, query_range_end), False, expected)
(all_data_after_row_policy.a == 0)
& (all_data_after_row_policy.c > query_range_start)
& (all_data_after_row_policy.c <= query_range_end)
]
self.query(
"SELECT {} from tab_02473 PREWHERE c > {} AND c <= {} WHERE a == 0;".format(
to_select, query_range_start, query_range_end
),
False,
expected,
)
expected = all_data_after_row_policy[
(all_data_after_row_policy.a == 0) &
((all_data_after_row_policy.c <= query_range_start) |
(all_data_after_row_policy.c > query_range_end))]
self.query('SELECT {} from tab_02473 PREWHERE c <= {} OR c > {} WHERE a == 0;'.format(
to_select, query_range_start, query_range_end), False, expected)
(all_data_after_row_policy.a == 0)
& (
(all_data_after_row_policy.c <= query_range_start)
| (all_data_after_row_policy.c > query_range_end)
)
]
self.query(
"SELECT {} from tab_02473 PREWHERE c <= {} OR c > {} WHERE a == 0;".format(
to_select, query_range_start, query_range_end
),
False,
expected,
)
def run_test(self, delete_range_start, delete_range_end, row_level_policy_range_start, row_level_policy_range_end):
def run_test(
self,
delete_range_start,
delete_range_end,
row_level_policy_range_start,
row_level_policy_range_end,
):
self.repro_queries = []
self.query('''
self.query(
"""
CREATE TABLE tab_02473 (a Int8, b Int32, c Int32, PRIMARY KEY (a))
ENGINE = MergeTree() ORDER BY (a, b)
SETTINGS min_bytes_for_wide_part = 0, index_granularity = {};'''.format(self.index_granularity))
SETTINGS min_bytes_for_wide_part = 0, index_granularity = {};""".format(
self.index_granularity
)
)
self.query('INSERT INTO tab_02473 select 0, number+1, number+1 FROM numbers({});'.format(self.total_rows))
self.query(
"INSERT INTO tab_02473 select 0, number+1, number+1 FROM numbers({});".format(
self.total_rows
)
)
client = ClickHouseClient()
all_data = client.query_return_df("SELECT a, b, c, 1 as d FROM tab_02473 FORMAT TabSeparatedWithNames;")
all_data = client.query_return_df(
"SELECT a, b, c, 1 as d FROM tab_02473 FORMAT TabSeparatedWithNames;"
)
self.query('OPTIMIZE TABLE tab_02473 FINAL SETTINGS mutations_sync=2;')
self.query("OPTIMIZE TABLE tab_02473 FINAL SETTINGS mutations_sync=2;")
# After all data has been written add a column with default value
self.query('ALTER TABLE tab_02473 ADD COLUMN d Int64 DEFAULT 1;')
self.query("ALTER TABLE tab_02473 ADD COLUMN d Int64 DEFAULT 1;")
self.check_data(all_data, -100, -100, -100, -100)
self.query('DELETE FROM tab_02473 WHERE a = 0 AND b > {} AND b <= {};'.format(
delete_range_start, delete_range_end))
self.query(
"DELETE FROM tab_02473 WHERE a = 0 AND b > {} AND b <= {};".format(
delete_range_start, delete_range_end
)
)
self.check_data(all_data, delete_range_start, delete_range_end, -100, -100)
self.query('CREATE ROW POLICY policy_tab_02473 ON tab_02473 FOR SELECT USING b <= {} OR b > {} TO default;'.format(
row_level_policy_range_start, row_level_policy_range_end))
self.query(
"CREATE ROW POLICY policy_tab_02473 ON tab_02473 FOR SELECT USING b <= {} OR b > {} TO default;".format(
row_level_policy_range_start, row_level_policy_range_end
)
)
self.check_data(all_data, delete_range_start, delete_range_end, row_level_policy_range_start, row_level_policy_range_end)
self.check_data(
all_data,
delete_range_start,
delete_range_end,
row_level_policy_range_start,
row_level_policy_range_end,
)
self.query('DROP POLICY policy_tab_02473 ON tab_02473;')
self.query('DROP TABLE tab_02473;')
self.query("DROP POLICY policy_tab_02473 ON tab_02473;")
self.query("DROP TABLE tab_02473;")
def main():
# Set mutations to synchronous mode and enable lightweight DELETE's
url = os.environ['CLICKHOUSE_URL'] + '&max_threads=1'
url = os.environ["CLICKHOUSE_URL"] + "&max_threads=1"
default_index_granularity = 10;
default_index_granularity = 10
total_rows = 8 * default_index_granularity
step = default_index_granularity
session = requests.Session()
for index_granularity in [default_index_granularity-1, default_index_granularity]: # [default_index_granularity-1, default_index_granularity+1, default_index_granularity]:
for index_granularity in [
default_index_granularity - 1,
default_index_granularity,
]: # [default_index_granularity-1, default_index_granularity+1, default_index_granularity]:
tester = Tester(session, url, index_granularity, total_rows)
# Test combinations of ranges of various size masked by lightweight DELETES
# along with ranges of various size masked by row-level policies
for delete_range_start in range(0, total_rows, 3 * step):
for delete_range_end in range(delete_range_start + 3 * step, total_rows, 2 * step):
for delete_range_end in range(
delete_range_start + 3 * step, total_rows, 2 * step
):
for row_level_policy_range_start in range(0, total_rows, 3 * step):
for row_level_policy_range_end in range(row_level_policy_range_start + 3 * step, total_rows, 2 * step):
tester.run_test(delete_range_start, delete_range_end, row_level_policy_range_start, row_level_policy_range_end)
for row_level_policy_range_end in range(
row_level_policy_range_start + 3 * step, total_rows, 2 * step
):
tester.run_test(
delete_range_start,
delete_range_end,
row_level_policy_range_start,
row_level_policy_range_end,
)
if __name__ == "__main__":
main()
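The expected results in this test (and the similar one that follows) are computed by filtering a pandas DataFrame returned by ClickHouseClient.query_return_df from the helpers package. A hypothetical minimal sketch of such a helper, assuming the HTTP interface and the TabSeparatedWithNames format; the class name matches the import, but the URL default and implementation details are illustrative and may differ from the real helper:

import io

import pandas as pd
import requests


class ClickHouseClient:
    def __init__(self, url="http://localhost:8123"):
        # Illustrative default; the real helper presumably takes connection details from the environment.
        self.url = url

    def query_return_df(self, query):
        # TabSeparatedWithNames puts column names in the first row,
        # so the response parses directly into a DataFrame.
        resp = requests.post(self.url, data=query)
        resp.raise_for_status()
        return pd.read_csv(io.StringIO(resp.text), sep="\t")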


@ -4,16 +4,17 @@ import os
import sys
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
class Tester:
'''
"""
- Creates test table with multiple integer columns
- Runs read queries with multiple range conditions on different columns in PREWHERE and check that the result is correct
'''
"""
def __init__(self, session, url, index_granularity, total_rows):
self.session = session
self.url = url
@ -23,10 +24,10 @@ class Tester:
self.repro_queries = []
def report_error(self):
print('Repro steps:', '\n\n\t'.join(self.repro_queries))
print("Repro steps:", "\n\n\t".join(self.repro_queries))
exit(1)
def query(self, query_text, include_in_repro_steps = True, expected_data = None):
def query(self, query_text, include_in_repro_steps=True, expected_data=None):
self.repro_queries.append(query_text)
resp = self.session.post(self.url, data=query_text)
if resp.status_code != 200:
@ -34,98 +35,150 @@ class Tester:
error = resp.text[0:40]
if error not in self.reported_errors:
self.reported_errors.add(error)
print('Code:', resp.status_code)
print('Result:', resp.text)
print("Code:", resp.status_code)
print("Result:", resp.text)
self.report_error()
result = resp.text
# Check that the result is as expected
if ((not expected_data is None) and (int(result) != len(expected_data))):
print('Expected {} rows, got {}'.format(len(expected_data), result))
print('Expected data:' + str(expected_data))
if (not expected_data is None) and (int(result) != len(expected_data)):
print("Expected {} rows, got {}".format(len(expected_data), result))
print("Expected data:" + str(expected_data))
self.report_error()
if not include_in_repro_steps:
self.repro_queries.pop()
def check_data(self, all_data, c_range_start, c_range_end, d_range_start, d_range_end):
for to_select in ['count()', 'sum(e)']: # Test reading with and without column with default value
self.query('SELECT {} FROM tab_02473;'.format(to_select), False, all_data)
def check_data(
self, all_data, c_range_start, c_range_end, d_range_start, d_range_end
):
for to_select in [
"count()",
"sum(e)",
]: # Test reading with and without column with default value
self.query("SELECT {} FROM tab_02473;".format(to_select), False, all_data)
delta = 10
for b_range_start in [0, delta]:
for b_range_end in [self.total_rows - delta]: #, self.total_rows]:
for b_range_end in [self.total_rows - delta]: # , self.total_rows]:
expected = all_data[
(all_data.a == 0) &
(all_data.b > b_range_start) &
(all_data.b <= b_range_end)]
self.query('SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} WHERE a == 0;'.format(
to_select, b_range_start, b_range_end), False, expected)
(all_data.a == 0)
& (all_data.b > b_range_start)
& (all_data.b <= b_range_end)
]
self.query(
"SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} WHERE a == 0;".format(
to_select, b_range_start, b_range_end
),
False,
expected,
)
expected = all_data[
(all_data.a == 0) &
(all_data.b > b_range_start) &
(all_data.b <= b_range_end) &
(all_data.c > c_range_start) &
(all_data.c <= c_range_end)]
self.query('SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} AND c > {} AND c <= {} WHERE a == 0;'.format(
to_select, b_range_start, b_range_end, c_range_start, c_range_end), False, expected)
(all_data.a == 0)
& (all_data.b > b_range_start)
& (all_data.b <= b_range_end)
& (all_data.c > c_range_start)
& (all_data.c <= c_range_end)
]
self.query(
"SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} AND c > {} AND c <= {} WHERE a == 0;".format(
to_select,
b_range_start,
b_range_end,
c_range_start,
c_range_end,
),
False,
expected,
)
expected = all_data[
(all_data.a == 0) &
(all_data.b > b_range_start) &
(all_data.b <= b_range_end) &
(all_data.c > c_range_start) &
(all_data.c <= c_range_end) &
(all_data.d > d_range_start) &
(all_data.d <= d_range_end)]
self.query('SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} AND c > {} AND c <= {} AND d > {} AND d <= {} WHERE a == 0;'.format(
to_select, b_range_start, b_range_end, c_range_start, c_range_end, d_range_start, d_range_end), False, expected)
(all_data.a == 0)
& (all_data.b > b_range_start)
& (all_data.b <= b_range_end)
& (all_data.c > c_range_start)
& (all_data.c <= c_range_end)
& (all_data.d > d_range_start)
& (all_data.d <= d_range_end)
]
self.query(
"SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} AND c > {} AND c <= {} AND d > {} AND d <= {} WHERE a == 0;".format(
to_select,
b_range_start,
b_range_end,
c_range_start,
c_range_end,
d_range_start,
d_range_end,
),
False,
expected,
)
def run_test(self, c_range_start, c_range_end, d_range_start, d_range_end):
self.repro_queries = []
self.query('''
self.query(
"""
CREATE TABLE tab_02473 (a Int8, b Int32, c Int32, d Int32, PRIMARY KEY (a))
ENGINE = MergeTree() ORDER BY (a, b)
SETTINGS min_bytes_for_wide_part = 0, index_granularity = {};'''.format(self.index_granularity))
SETTINGS min_bytes_for_wide_part = 0, index_granularity = {};""".format(
self.index_granularity
)
)
self.query('INSERT INTO tab_02473 select 0, number+1, number+1, number+1 FROM numbers({});'.format(self.total_rows))
self.query(
"INSERT INTO tab_02473 select 0, number+1, number+1, number+1 FROM numbers({});".format(
self.total_rows
)
)
client = ClickHouseClient()
all_data = client.query_return_df("SELECT a, b, c, d, 1 as e FROM tab_02473 FORMAT TabSeparatedWithNames;")
all_data = client.query_return_df(
"SELECT a, b, c, d, 1 as e FROM tab_02473 FORMAT TabSeparatedWithNames;"
)
self.query('OPTIMIZE TABLE tab_02473 FINAL SETTINGS mutations_sync=2;')
self.query("OPTIMIZE TABLE tab_02473 FINAL SETTINGS mutations_sync=2;")
# After all data has been written add a column with default value
self.query('ALTER TABLE tab_02473 ADD COLUMN e Int64 DEFAULT 1;')
self.query("ALTER TABLE tab_02473 ADD COLUMN e Int64 DEFAULT 1;")
self.check_data(all_data, c_range_start, c_range_end, d_range_start, d_range_end)
self.query('DROP TABLE tab_02473;')
self.check_data(
all_data, c_range_start, c_range_end, d_range_start, d_range_end
)
self.query("DROP TABLE tab_02473;")
def main():
# Enable multiple prewhere read steps
url = os.environ['CLICKHOUSE_URL'] + '&enable_multiple_prewhere_read_steps=1&move_all_conditions_to_prewhere=0&max_threads=1'
url = (
os.environ["CLICKHOUSE_URL"]
+ "&enable_multiple_prewhere_read_steps=1&move_all_conditions_to_prewhere=0&max_threads=1"
)
default_index_granularity = 10;
default_index_granularity = 10
total_rows = 8 * default_index_granularity
step = default_index_granularity
session = requests.Session()
for index_granularity in [default_index_granularity-1, default_index_granularity]:
for index_granularity in [default_index_granularity - 1, default_index_granularity]:
tester = Tester(session, url, index_granularity, total_rows)
# Test combinations of ranges of columns c and d
for c_range_start in range(0, total_rows, int(2.3 * step)):
for c_range_end in range(c_range_start + 3 * step, total_rows, int(2.1 * step)):
for d_range_start in range(int(0.5 * step), total_rows, int(2.7 * step)):
for d_range_end in range(d_range_start + 3 * step, total_rows, int(2.2 * step)):
tester.run_test(c_range_start, c_range_end, d_range_start, d_range_end)
for c_range_end in range(
c_range_start + 3 * step, total_rows, int(2.1 * step)
):
for d_range_start in range(
int(0.5 * step), total_rows, int(2.7 * step)
):
for d_range_end in range(
d_range_start + 3 * step, total_rows, int(2.2 * step)
):
tester.run_test(
c_range_start, c_range_end, d_range_start, d_range_end
)
if __name__ == "__main__":
main()


@ -8,7 +8,7 @@ import time
from threading import Thread
CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient
@ -16,14 +16,23 @@ client = ClickHouseClient()
# test table without partition
client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part NO DELAY")
client.query('''
client.query(
"""
CREATE TABLE t_async_insert_dedup_no_part (
KeyID UInt32
) Engine = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/t_async_insert_dedup', '{replica}')
ORDER BY (KeyID)
''')
"""
)
client.query("insert into t_async_insert_dedup_no_part values (1), (2), (3), (4), (5)", settings = {"async_insert": 1, "wait_for_async_insert": 1, "insert_keeper_fault_injection_probability": 0})
client.query(
"insert into t_async_insert_dedup_no_part values (1), (2), (3), (4), (5)",
settings={
"async_insert": 1,
"wait_for_async_insert": 1,
"insert_keeper_fault_injection_probability": 0,
},
)
result = client.query("select count(*) from t_async_insert_dedup_no_part")
print(result, flush=True)
client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part NO DELAY")
@ -32,13 +41,13 @@ client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part NO DELAY")
def generate_data(q, total_number):
old_data = []
max_chunk_size = 30
partitions = ['2022-11-11 10:10:10', '2022-12-12 10:10:10']
partitions = ["2022-11-11 10:10:10", "2022-12-12 10:10:10"]
last_number = 0
while True:
dup_simulate = random.randint(0,3)
dup_simulate = random.randint(0, 3)
# insert old data randomly. 25% of them are dup.
if dup_simulate == 0:
last_idx = len(old_data)-1
last_idx = len(old_data) - 1
if last_idx < 0:
continue
idx = last_idx - random.randint(0, 50)
@ -53,7 +62,7 @@ def generate_data(q, total_number):
end = start + chunk_size
if end > total_number:
end = total_number
for i in range(start, end+1):
for i in range(start, end + 1):
partition = partitions[random.randint(0, 1)]
insert_stmt += "('{}', {}),".format(partition, i)
insert_stmt = insert_stmt[:-1]
@ -65,33 +74,46 @@ def generate_data(q, total_number):
# wait until all the tasks are done.
q.join()
def fetch_and_insert_data(q, client):
while True:
insert = q.get()
client.query(insert, settings = {"async_insert": 1, "async_insert_deduplicate": 1, "wait_for_async_insert": 0, "async_insert_busy_timeout_ms": 1500, "insert_keeper_fault_injection_probability": 0})
client.query(
insert,
settings={
"async_insert": 1,
"async_insert_deduplicate": 1,
"wait_for_async_insert": 0,
"async_insert_busy_timeout_ms": 1500,
"insert_keeper_fault_injection_probability": 0,
},
)
q.task_done()
sleep_time = random.randint(50, 500)
time.sleep(sleep_time/1000.0)
time.sleep(sleep_time / 1000.0)
# main process
client.query("DROP TABLE IF EXISTS t_async_insert_dedup NO DELAY")
client.query('''
client.query(
"""
CREATE TABLE t_async_insert_dedup (
EventDate DateTime,
KeyID UInt32
) Engine = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/t_async_insert_dedup', '{replica}')
PARTITION BY toYYYYMM(EventDate)
ORDER BY (KeyID, EventDate) SETTINGS use_async_block_ids_cache = 1
''')
"""
)
q = queue.Queue(100)
total_number = 10000
gen = Thread(target = generate_data, args = [q, total_number])
gen = Thread(target=generate_data, args=[q, total_number])
gen.start()
for i in range(3):
insert = Thread(target = fetch_and_insert_data, args = [q, client])
insert = Thread(target=fetch_and_insert_data, args=[q, client])
insert.start()
gen.join()
@ -109,7 +131,7 @@ while True:
errMsg = f"the size of result is {len(result)}. we expect {total_number}."
else:
for i in range(total_number):
expect = str(i+1)
expect = str(i + 1)
real = result[i]
if expect != real:
err = True
@ -117,7 +139,7 @@ while True:
break
# retry several times to get stable results.
if err and retry >= 5:
print (errMsg, flush=True)
print(errMsg, flush=True)
elif err:
retry += 1
continue
@ -125,11 +147,15 @@ while True:
print(len(result), flush=True)
break
result = client.query("SELECT value FROM system.metrics where metric = 'AsyncInsertCacheSize'")
result = client.query(
"SELECT value FROM system.metrics where metric = 'AsyncInsertCacheSize'"
)
result = int(result.split()[0])
if result <= 0:
raise Exception(f"AsyncInsertCacheSize should > 0, but got {result}")
result = client.query("SELECT value FROM system.events where event = 'AsyncInsertCacheHits'")
result = client.query(
"SELECT value FROM system.events where event = 'AsyncInsertCacheHits'"
)
result = int(result.split()[0])
if result <= 0:
raise Exception(f"AsyncInsertCacheHits should > 0, but got {result}")


@ -19,9 +19,9 @@ import tenacity
import xmltodict
import yaml
SELECT_VERSION = r'SELECT version()'
SELECT_VERSION = r"SELECT version()"
SELECT_UPTIME = r'''
SELECT_UPTIME = r"""
{% if version_ge('21.3') -%}
SELECT formatReadableTimeDelta(uptime())
{% else -%}
@ -29,18 +29,18 @@ SELECT
toString(floor(uptime() / 3600 / 24)) || ' days ' ||
toString(floor(uptime() % (24 * 3600) / 3600, 1)) || ' hours'
{% endif -%}
'''
"""
SELECT_SYSTEM_TABLES = "SELECT name FROM system.tables WHERE database = 'system'"
SELECT_DATABASE_ENGINES = r'''SELECT
SELECT_DATABASE_ENGINES = r"""SELECT
engine,
count() "count"
FROM system.databases
GROUP BY engine
'''
"""
SELECT_DATABASES = r'''SELECT
SELECT_DATABASES = r"""SELECT
name,
engine,
tables,
@ -62,17 +62,17 @@ LEFT JOIN
) AS db_stats ON db.name = db_stats.database
ORDER BY bytes_on_disk DESC
LIMIT 10
'''
"""
SELECT_TABLE_ENGINES = r'''SELECT
SELECT_TABLE_ENGINES = r"""SELECT
engine,
count() "count"
FROM system.tables
WHERE database != 'system'
GROUP BY engine
'''
"""
SELECT_DICTIONARIES = r'''SELECT
SELECT_DICTIONARIES = r"""SELECT
source,
type,
status,
@ -80,13 +80,13 @@ SELECT_DICTIONARIES = r'''SELECT
FROM system.dictionaries
GROUP BY source, type, status
ORDER BY status DESC, source
'''
"""
SELECT_ACCESS = "SHOW ACCESS"
SELECT_QUOTA_USAGE = "SHOW QUOTA"
SELECT_REPLICAS = r'''SELECT
SELECT_REPLICAS = r"""SELECT
database,
table,
is_leader,
@ -98,9 +98,9 @@ SELECT_REPLICAS = r'''SELECT
FROM system.replicas
ORDER BY absolute_delay DESC
LIMIT 10
'''
"""
SELECT_REPLICATION_QUEUE = r'''SELECT
SELECT_REPLICATION_QUEUE = r"""SELECT
database,
table,
replica_name,
@ -121,9 +121,9 @@ SELECT_REPLICATION_QUEUE = r'''SELECT
FROM system.replication_queue
ORDER BY create_time ASC
LIMIT 20
'''
"""
SELECT_REPLICATED_FETCHES = r'''SELECT
SELECT_REPLICATED_FETCHES = r"""SELECT
database,
table,
round(elapsed, 1) "elapsed",
@ -140,9 +140,9 @@ SELECT_REPLICATED_FETCHES = r'''SELECT
to_detached,
thread_id
FROM system.replicated_fetches
'''
"""
SELECT_PARTS_PER_TABLE = r'''SELECT
SELECT_PARTS_PER_TABLE = r"""SELECT
database,
table,
count() "partitions",
@ -162,9 +162,9 @@ FROM
GROUP BY database, table
ORDER BY max_parts_per_partition DESC
LIMIT 10
'''
"""
SELECT_MERGES = r'''SELECT
SELECT_MERGES = r"""SELECT
database,
table,
round(elapsed, 1) "elapsed",
@ -187,9 +187,9 @@ SELECT_MERGES = r'''SELECT
formatReadableSize(memory_usage) "memory_usage"
{% endif -%}
FROM system.merges
'''
"""
SELECT_MUTATIONS = r'''SELECT
SELECT_MUTATIONS = r"""SELECT
database,
table,
mutation_id,
@ -206,9 +206,9 @@ SELECT_MUTATIONS = r'''SELECT
FROM system.mutations
WHERE NOT is_done
ORDER BY create_time DESC
'''
"""
SELECT_RECENT_DATA_PARTS = r'''SELECT
SELECT_RECENT_DATA_PARTS = r"""SELECT
database,
table,
engine,
@ -242,9 +242,9 @@ SELECT_RECENT_DATA_PARTS = r'''SELECT
FROM system.parts
WHERE modification_time > now() - INTERVAL 3 MINUTE
ORDER BY modification_time DESC
'''
"""
SELECT_DETACHED_DATA_PARTS = r'''SELECT
SELECT_DETACHED_DATA_PARTS = r"""SELECT
database,
table,
partition_id,
@ -255,9 +255,9 @@ SELECT_DETACHED_DATA_PARTS = r'''SELECT
max_block_number,
level
FROM system.detached_parts
'''
"""
SELECT_PROCESSES = r'''SELECT
SELECT_PROCESSES = r"""SELECT
elapsed,
query_id,
{% if normalize_queries -%}
@ -285,9 +285,9 @@ SELECT_PROCESSES = r'''SELECT
{% endif -%}
FROM system.processes
ORDER BY elapsed DESC
'''
"""
SELECT_TOP_QUERIES_BY_DURATION = r'''SELECT
SELECT_TOP_QUERIES_BY_DURATION = r"""SELECT
type,
query_start_time,
query_duration_ms,
@ -339,9 +339,9 @@ WHERE type != 'QueryStart'
AND event_time >= now() - INTERVAL 1 DAY
ORDER BY query_duration_ms DESC
LIMIT 10
'''
"""
SELECT_TOP_QUERIES_BY_MEMORY_USAGE = r'''SELECT
SELECT_TOP_QUERIES_BY_MEMORY_USAGE = r"""SELECT
type,
query_start_time,
query_duration_ms,
@ -393,9 +393,9 @@ WHERE type != 'QueryStart'
AND event_time >= now() - INTERVAL 1 DAY
ORDER BY memory_usage DESC
LIMIT 10
'''
"""
SELECT_FAILED_QUERIES = r'''SELECT
SELECT_FAILED_QUERIES = r"""SELECT
type,
query_start_time,
query_duration_ms,
@ -448,9 +448,9 @@ WHERE type != 'QueryStart'
AND exception != ''
ORDER BY query_start_time DESC
LIMIT 10
'''
"""
SELECT_STACK_TRACES = r'''SELECT
SELECT_STACK_TRACES = r"""SELECT
'\n' || arrayStringConcat(
arrayMap(
x,
@ -459,9 +459,9 @@ SELECT_STACK_TRACES = r'''SELECT
arrayMap(x -> demangle(addressToSymbol(x)), trace)),
'\n') AS trace
FROM system.stack_trace
'''
"""
SELECT_CRASH_LOG = r'''SELECT
SELECT_CRASH_LOG = r"""SELECT
event_time,
signal,
thread_id,
@ -470,7 +470,7 @@ SELECT_CRASH_LOG = r'''SELECT
version
FROM system.crash_log
ORDER BY event_time DESC
'''
"""
def retry(exception_types, max_attempts=5, max_interval=5):
@ -481,7 +481,8 @@ def retry(exception_types, max_attempts=5, max_interval=5):
retry=tenacity.retry_if_exception_type(exception_types),
wait=tenacity.wait_random_exponential(multiplier=0.5, max=max_interval),
stop=tenacity.stop_after_attempt(max_attempts),
reraise=True)
reraise=True,
)
class ClickhouseError(Exception):
@ -502,9 +503,9 @@ class ClickhouseClient:
def __init__(self, *, host="localhost", port=8123, user="default", password):
self._session = requests.Session()
if user:
self._session.headers['X-ClickHouse-User'] = user
self._session.headers['X-ClickHouse-Key'] = password
self._url = f'http://{host}:{port}'
self._session.headers["X-ClickHouse-User"] = user
self._session.headers["X-ClickHouse-Key"] = password
self._url = f"http://{host}:{port}"
self._timeout = 60
self._ch_version = None
@ -516,7 +517,16 @@ class ClickhouseClient:
return self._ch_version
@retry(requests.exceptions.ConnectionError)
def query(self, query, query_args=None, format=None, post_data=None, timeout=None, echo=False, dry_run=False):
def query(
self,
query,
query_args=None,
format=None,
post_data=None,
timeout=None,
echo=False,
dry_run=False,
):
"""
Execute query.
"""
@ -524,28 +534,30 @@ class ClickhouseClient:
query = self.render_query(query, **query_args)
if format:
query += f' FORMAT {format}'
query += f" FORMAT {format}"
if timeout is None:
timeout = self._timeout
if echo:
print(sqlparse.format(query, reindent=True), '\n')
print(sqlparse.format(query, reindent=True), "\n")
if dry_run:
return None
try:
response = self._session.post(self._url,
params={
'query': query,
},
json=post_data,
timeout=timeout)
response = self._session.post(
self._url,
params={
"query": query,
},
json=post_data,
timeout=timeout,
)
response.raise_for_status()
if format in ('JSON', 'JSONCompact'):
if format in ("JSON", "JSONCompact"):
return response.json()
return response.text.strip()
@ -555,7 +567,9 @@ class ClickhouseClient:
def render_query(self, query, **kwargs):
env = jinja2.Environment()
env.globals['version_ge'] = lambda version: version_ge(self.clickhouse_version, version)
env.globals["version_ge"] = lambda version: version_ge(
self.clickhouse_version, version
)
template = env.from_string(query)
return template.render(kwargs)
@ -578,11 +592,13 @@ class ClickhouseConfig:
@classmethod
def load(cls):
return ClickhouseConfig(cls._load_config('/var/lib/clickhouse/preprocessed_configs/config.xml'))
return ClickhouseConfig(
cls._load_config("/var/lib/clickhouse/preprocessed_configs/config.xml")
)
@staticmethod
def _load_config(config_path):
with open(config_path, 'r') as file:
with open(config_path, "r") as file:
return xmltodict.parse(file.read())
@classmethod
@ -591,8 +607,8 @@ class ClickhouseConfig:
for key, value in list(config.items()):
if isinstance(value, MutableMapping):
cls._mask_secrets(config[key])
elif key in ('password', 'secret_access_key', 'header', 'identity'):
config[key] = '*****'
elif key in ("password", "secret_access_key", "header", "identity"):
config[key] = "*****"
class DiagnosticsData:
@ -603,53 +619,53 @@ class DiagnosticsData:
def __init__(self, args):
self.args = args
self.host = args.host
self._sections = [{'section': None, 'data': {}}]
self._sections = [{"section": None, "data": {}}]
def add_string(self, name, value, section=None):
self._section(section)[name] = {
'type': 'string',
'value': value,
"type": "string",
"value": value,
}
def add_xml_document(self, name, document, section=None):
self._section(section)[name] = {
'type': 'xml',
'value': document,
"type": "xml",
"value": document,
}
def add_query(self, name, query, result, section=None):
self._section(section)[name] = {
'type': 'query',
'query': query,
'result': result,
"type": "query",
"query": query,
"result": result,
}
def add_command(self, name, command, result, section=None):
self._section(section)[name] = {
'type': 'command',
'command': command,
'result': result,
"type": "command",
"command": command,
"result": result,
}
def dump(self, format):
if format.startswith('json'):
if format.startswith("json"):
result = self._dump_json()
elif format.startswith('yaml'):
elif format.startswith("yaml"):
result = self._dump_yaml()
else:
result = self._dump_wiki()
if format.endswith('.gz'):
compressor = gzip.GzipFile(mode='wb', fileobj=sys.stdout.buffer)
if format.endswith(".gz"):
compressor = gzip.GzipFile(mode="wb", fileobj=sys.stdout.buffer)
compressor.write(result.encode())
else:
print(result)
def _section(self, name=None):
if self._sections[-1]['section'] != name:
self._sections.append({'section': name, 'data': {}})
if self._sections[-1]["section"] != name:
self._sections.append({"section": name, "data": {}})
return self._sections[-1]['data']
return self._sections[-1]["data"]
def _dump_json(self):
"""
@ -669,85 +685,85 @@ class DiagnosticsData:
"""
def _write_title(buffer, value):
buffer.write(f'### {value}\n')
buffer.write(f"### {value}\n")
def _write_subtitle(buffer, value):
buffer.write(f'#### {value}\n')
buffer.write(f"#### {value}\n")
def _write_string_item(buffer, name, item):
value = item['value']
if value != '':
value = f'**{value}**'
buffer.write(f'{name}: {value}\n')
value = item["value"]
if value != "":
value = f"**{value}**"
buffer.write(f"{name}: {value}\n")
def _write_xml_item(buffer, section_name, name, item):
if section_name:
buffer.write(f'##### {name}\n')
buffer.write(f"##### {name}\n")
else:
_write_subtitle(buffer, name)
_write_result(buffer, item['value'], format='XML')
_write_result(buffer, item["value"], format="XML")
def _write_query_item(buffer, section_name, name, item):
if section_name:
buffer.write(f'##### {name}\n')
buffer.write(f"##### {name}\n")
else:
_write_subtitle(buffer, name)
_write_query(buffer, item['query'])
_write_result(buffer, item['result'])
_write_query(buffer, item["query"])
_write_result(buffer, item["result"])
def _write_command_item(buffer, section_name, name, item):
if section_name:
buffer.write(f'##### {name}\n')
buffer.write(f"##### {name}\n")
else:
_write_subtitle(buffer, name)
_write_command(buffer, item['command'])
_write_result(buffer, item['result'])
_write_command(buffer, item["command"])
_write_result(buffer, item["result"])
def _write_unknown_item(buffer, section_name, name, item):
if section_name:
buffer.write(f'**{name}**\n')
buffer.write(f"**{name}**\n")
else:
_write_subtitle(buffer, name)
json.dump(item, buffer, indent=2)
def _write_query(buffer, query):
buffer.write('**query**\n')
buffer.write('```sql\n')
buffer.write("**query**\n")
buffer.write("```sql\n")
buffer.write(query)
buffer.write('\n```\n')
buffer.write("\n```\n")
def _write_command(buffer, command):
buffer.write('**command**\n')
buffer.write('```\n')
buffer.write("**command**\n")
buffer.write("```\n")
buffer.write(command)
buffer.write('\n```\n')
buffer.write("\n```\n")
def _write_result(buffer, result, format=None):
buffer.write('**result**\n')
buffer.write(f'```{format}\n' if format else '```\n')
buffer.write("**result**\n")
buffer.write(f"```{format}\n" if format else "```\n")
buffer.write(result)
buffer.write('\n```\n')
buffer.write("\n```\n")
buffer = io.StringIO()
_write_title(buffer, f'Diagnostics data for host {self.host}')
_write_title(buffer, f"Diagnostics data for host {self.host}")
for section in self._sections:
section_name = section['section']
section_name = section["section"]
if section_name:
_write_subtitle(buffer, section_name)
for name, item in section['data'].items():
if item['type'] == 'string':
for name, item in section["data"].items():
if item["type"] == "string":
_write_string_item(buffer, name, item)
elif item['type'] == 'query':
elif item["type"] == "query":
_write_query_item(buffer, section_name, name, item)
elif item['type'] == 'command':
elif item["type"] == "command":
_write_command_item(buffer, section_name, name, item)
elif item['type'] == 'xml':
elif item["type"] == "xml":
_write_xml_item(buffer, section_name, name, item)
else:
_write_unknown_item(buffer, section_name, name, item)
@ -760,126 +776,196 @@ def main():
Program entry point.
"""
args = parse_args()
timestamp = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S')
client = ClickhouseClient(host=args.host, port=args.port, user=args.user, password=args.password)
timestamp = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")
client = ClickhouseClient(
host=args.host, port=args.port, user=args.user, password=args.password
)
ch_config = ClickhouseConfig.load()
version = client.clickhouse_version
system_tables = [row[0] for row in execute_query(client, SELECT_SYSTEM_TABLES, format='JSONCompact')['data']]
system_tables = [
row[0]
for row in execute_query(client, SELECT_SYSTEM_TABLES, format="JSONCompact")[
"data"
]
]
diagnostics = DiagnosticsData(args)
diagnostics.add_string('Version', version)
diagnostics.add_string('Timestamp', timestamp)
diagnostics.add_string('Uptime', execute_query(client, SELECT_UPTIME))
diagnostics.add_string("Version", version)
diagnostics.add_string("Timestamp", timestamp)
diagnostics.add_string("Uptime", execute_query(client, SELECT_UPTIME))
diagnostics.add_xml_document('ClickHouse configuration', ch_config.dump())
diagnostics.add_xml_document("ClickHouse configuration", ch_config.dump())
if version_ge(version, '20.8'):
add_query(diagnostics, 'Access configuration',
client=client,
query=SELECT_ACCESS,
format='TSVRaw')
add_query(diagnostics, 'Quotas',
client=client,
query=SELECT_QUOTA_USAGE,
format='Vertical')
if version_ge(version, "20.8"):
add_query(
diagnostics,
"Access configuration",
client=client,
query=SELECT_ACCESS,
format="TSVRaw",
)
add_query(
diagnostics,
"Quotas",
client=client,
query=SELECT_QUOTA_USAGE,
format="Vertical",
)
add_query(diagnostics, 'Database engines',
client=client,
query=SELECT_DATABASE_ENGINES,
format='PrettyCompactNoEscapes',
section='Schema')
add_query(diagnostics, 'Databases (top 10 by size)',
client=client,
query=SELECT_DATABASES,
format='PrettyCompactNoEscapes',
section='Schema')
add_query(diagnostics, 'Table engines',
client=client,
query=SELECT_TABLE_ENGINES,
format='PrettyCompactNoEscapes',
section='Schema')
add_query(diagnostics, 'Dictionaries',
client=client,
query=SELECT_DICTIONARIES,
format='PrettyCompactNoEscapes',
section='Schema')
add_query(
diagnostics,
"Database engines",
client=client,
query=SELECT_DATABASE_ENGINES,
format="PrettyCompactNoEscapes",
section="Schema",
)
add_query(
diagnostics,
"Databases (top 10 by size)",
client=client,
query=SELECT_DATABASES,
format="PrettyCompactNoEscapes",
section="Schema",
)
add_query(
diagnostics,
"Table engines",
client=client,
query=SELECT_TABLE_ENGINES,
format="PrettyCompactNoEscapes",
section="Schema",
)
add_query(
diagnostics,
"Dictionaries",
client=client,
query=SELECT_DICTIONARIES,
format="PrettyCompactNoEscapes",
section="Schema",
)
add_query(diagnostics, 'Replicated tables (top 10 by absolute delay)',
client=client,
query=SELECT_REPLICAS,
format='PrettyCompactNoEscapes',
section='Replication')
add_query(diagnostics, 'Replication queue (top 20 oldest tasks)',
client=client,
query=SELECT_REPLICATION_QUEUE,
format='Vertical',
section='Replication')
if version_ge(version, '21.3'):
add_query(diagnostics, 'Replicated fetches',
client=client,
query=SELECT_REPLICATED_FETCHES,
format='Vertical',
section='Replication')
add_query(
diagnostics,
"Replicated tables (top 10 by absolute delay)",
client=client,
query=SELECT_REPLICAS,
format="PrettyCompactNoEscapes",
section="Replication",
)
add_query(
diagnostics,
"Replication queue (top 20 oldest tasks)",
client=client,
query=SELECT_REPLICATION_QUEUE,
format="Vertical",
section="Replication",
)
if version_ge(version, "21.3"):
add_query(
diagnostics,
"Replicated fetches",
client=client,
query=SELECT_REPLICATED_FETCHES,
format="Vertical",
section="Replication",
)
add_query(diagnostics, 'Top 10 tables by max parts per partition',
client=client,
query=SELECT_PARTS_PER_TABLE,
format='PrettyCompactNoEscapes')
add_query(diagnostics, 'Merges in progress',
client=client,
query=SELECT_MERGES,
format='Vertical')
add_query(diagnostics, 'Mutations in progress',
client=client,
query=SELECT_MUTATIONS,
format='Vertical')
add_query(diagnostics, 'Recent data parts (modification time within last 3 minutes)',
client=client,
query=SELECT_RECENT_DATA_PARTS,
format='Vertical')
add_query(
diagnostics,
"Top 10 tables by max parts per partition",
client=client,
query=SELECT_PARTS_PER_TABLE,
format="PrettyCompactNoEscapes",
)
add_query(
diagnostics,
"Merges in progress",
client=client,
query=SELECT_MERGES,
format="Vertical",
)
add_query(
diagnostics,
"Mutations in progress",
client=client,
query=SELECT_MUTATIONS,
format="Vertical",
)
add_query(
diagnostics,
"Recent data parts (modification time within last 3 minutes)",
client=client,
query=SELECT_RECENT_DATA_PARTS,
format="Vertical",
)
add_query(diagnostics, 'system.detached_parts',
client=client,
query=SELECT_DETACHED_DATA_PARTS,
format='PrettyCompactNoEscapes',
section='Detached data')
add_command(diagnostics, 'Disk space usage',
command='du -sh -L -c /var/lib/clickhouse/data/*/*/detached/* | sort -rsh',
section='Detached data')
add_query(
diagnostics,
"system.detached_parts",
client=client,
query=SELECT_DETACHED_DATA_PARTS,
format="PrettyCompactNoEscapes",
section="Detached data",
)
add_command(
diagnostics,
"Disk space usage",
command="du -sh -L -c /var/lib/clickhouse/data/*/*/detached/* | sort -rsh",
section="Detached data",
)
add_query(diagnostics, 'Queries in progress (process list)',
client=client,
query=SELECT_PROCESSES,
format='Vertical',
section='Queries')
add_query(diagnostics, 'Top 10 queries by duration',
client=client,
query=SELECT_TOP_QUERIES_BY_DURATION,
format='Vertical',
section='Queries')
add_query(diagnostics, 'Top 10 queries by memory usage',
client=client,
query=SELECT_TOP_QUERIES_BY_MEMORY_USAGE,
format='Vertical',
section='Queries')
add_query(diagnostics, 'Last 10 failed queries',
client=client,
query=SELECT_FAILED_QUERIES,
format='Vertical',
section='Queries')
add_query(
diagnostics,
"Queries in progress (process list)",
client=client,
query=SELECT_PROCESSES,
format="Vertical",
section="Queries",
)
add_query(
diagnostics,
"Top 10 queries by duration",
client=client,
query=SELECT_TOP_QUERIES_BY_DURATION,
format="Vertical",
section="Queries",
)
add_query(
diagnostics,
"Top 10 queries by memory usage",
client=client,
query=SELECT_TOP_QUERIES_BY_MEMORY_USAGE,
format="Vertical",
section="Queries",
)
add_query(
diagnostics,
"Last 10 failed queries",
client=client,
query=SELECT_FAILED_QUERIES,
format="Vertical",
section="Queries",
)
add_query(diagnostics, 'Stack traces',
client=client,
query=SELECT_STACK_TRACES,
format='Vertical')
add_query(
diagnostics,
"Stack traces",
client=client,
query=SELECT_STACK_TRACES,
format="Vertical",
)
if 'crash_log' in system_tables:
add_query(diagnostics, 'Crash log',
client=client,
query=SELECT_CRASH_LOG,
format='Vertical')
if "crash_log" in system_tables:
add_query(
diagnostics,
"Crash log",
client=client,
query=SELECT_CRASH_LOG,
format="Vertical",
)
add_command(diagnostics, 'uname', 'uname -a')
add_command(diagnostics, "uname", "uname -a")
diagnostics.dump(args.format)
@ -889,29 +975,34 @@ def parse_args():
Parse command-line arguments.
"""
parser = argparse.ArgumentParser()
parser.add_argument('--format',
choices=['json', 'yaml', 'json.gz', 'yaml.gz', 'wiki', 'wiki.gz'],
default='wiki')
parser.add_argument('--normalize-queries',
action='store_true',
default=False)
parser.add_argument('--host', dest="host", help="clickhouse host")
parser.add_argument('--port', dest="port", default=8123, help="clickhouse http port")
parser.add_argument('--user', dest="user", default="default", help="clickhouse user")
parser.add_argument('--password', dest="password", help="clickhouse password")
parser.add_argument(
"--format",
choices=["json", "yaml", "json.gz", "yaml.gz", "wiki", "wiki.gz"],
default="wiki",
)
parser.add_argument("--normalize-queries", action="store_true", default=False)
parser.add_argument("--host", dest="host", help="clickhouse host")
parser.add_argument(
"--port", dest="port", default=8123, help="clickhouse http port"
)
parser.add_argument(
"--user", dest="user", default="default", help="clickhouse user"
)
parser.add_argument("--password", dest="password", help="clickhouse password")
return parser.parse_args()
def add_query(diagnostics, name, client, query, format, section=None):
query_args = {
'normalize_queries': diagnostics.args.normalize_queries,
"normalize_queries": diagnostics.args.normalize_queries,
}
query = client.render_query(query, **query_args)
diagnostics.add_query(
name=name,
query=query,
result=execute_query(client, query, render_query=False, format=format),
section=section)
section=section,
)
def execute_query(client, query, render_query=True, format=None):
@ -926,14 +1017,18 @@ def execute_query(client, query, render_query=True, format=None):
def add_command(diagnostics, name, command, section=None):
diagnostics.add_command(
name=name,
command=command,
result=execute_command(command),
section=section)
name=name, command=command, result=execute_command(command), section=section
)
def execute_command(command, input=None):
proc = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
proc = subprocess.Popen(
command,
shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
if isinstance(input, str):
input = input.encode()
@ -941,7 +1036,7 @@ def execute_command(command, input=None):
stdout, stderr = proc.communicate(input=input)
if proc.returncode:
return f'failed with exit code {proc.returncode}\n{stderr.decode()}'
return f"failed with exit code {proc.returncode}\n{stderr.decode()}"
return stdout.decode()
@ -957,8 +1052,8 @@ def parse_version(version):
"""
Parse version string.
"""
return [int(x) for x in version.strip().split('.') if x.isnumeric()]
return [int(x) for x in version.strip().split(".") if x.isnumeric()]
if __name__ == '__main__':
if __name__ == "__main__":
main()
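version_ge, used by the version checks and bound as a Jinja global in render_query above, is a thin wrapper around parse_version; a minimal sketch consistent with the definition shown:

def version_ge(version, target):
    # Lists of ints compare lexicographically, so [21, 8, 3] >= [21, 3] behaves as expected.
    return parse_version(version) >= parse_version(target)

For example, version_ge("21.8.3", "21.3") is True, which is what selects the formatReadableTimeDelta branch of SELECT_UPTIME.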


@ -28,39 +28,48 @@ class S3API(object):
bucket = self.connection.get_bucket(bucket_name)
key = bucket.initiate_multipart_upload(s3_path)
logging.info("Will upload to s3 path %s", s3_path)
chunksize = 1024 * 1024 * 1024 # 1 GB
chunksize = 1024 * 1024 * 1024 # 1 GB
filesize = os.stat(file_path).st_size
logging.info("File size is %s", filesize)
chunkcount = int(math.ceil(filesize / chunksize))
def call_back(x, y):
print("Uploaded {}/{} bytes".format(x, y))
try:
for i in range(chunkcount + 1):
logging.info("Uploading chunk %s of %s", i, chunkcount + 1)
offset = chunksize * i
bytes_size = min(chunksize, filesize - offset)
with open(file_path, 'r') as fp:
with open(file_path, "r") as fp:
fp.seek(offset)
key.upload_part_from_file(fp=fp, part_num=i+1,
size=bytes_size, cb=call_back,
num_cb=100)
key.upload_part_from_file(
fp=fp, part_num=i + 1, size=bytes_size, cb=call_back, num_cb=100
)
key.complete_upload()
except Exception as ex:
key.cancel_upload()
raise ex
logging.info("Contents were set")
return "https://{bucket}.{mds_url}/{path}".format(
bucket=bucket_name, mds_url=self.mds_url, path=s3_path)
bucket=bucket_name, mds_url=self.mds_url, path=s3_path
)
def set_file_contents(self, bucket, local_file_path, s3_file_path):
key = Key(bucket)
key.key = s3_file_path
file_size = os.stat(local_file_path).st_size
logging.info("Uploading file `%s` to `%s`. Size is %s", local_file_path, s3_file_path, file_size)
logging.info(
"Uploading file `%s` to `%s`. Size is %s",
local_file_path,
s3_file_path,
file_size,
)
def call_back(x, y):
print("Uploaded {}/{} bytes".format(x, y))
key.set_contents_from_filename(local_file_path, cb=call_back)
def upload_data_for_static_files_disk(self, bucket_name, directory_path, s3_path):
@ -74,12 +83,14 @@ class S3API(object):
path = root.split(os.sep)
for file in files:
local_file_path = os.path.join(root, file)
s3_file = local_file_path[len(directory_path) + 1:]
s3_file = local_file_path[len(directory_path) + 1 :]
s3_file_path = os.path.join(s3_path, s3_file)
self.set_file_contents(bucket, local_file_path, s3_file_path)
logging.info("Uploading finished")
return "https://{bucket}.{mds_url}/{path}".format(bucket=bucket_name, mds_url=self.mds_url, path=s3_path)
return "https://{bucket}.{mds_url}/{path}".format(
bucket=bucket_name, mds_url=self.mds_url, path=s3_path
)
def list_bucket_keys(self, bucket_name):
bucket = self.connection.get_bucket(bucket_name)
@ -91,100 +102,121 @@ class S3API(object):
bucket.get_all_keys()
for obj in bucket.get_all_keys():
if obj.key.startswith(folder_path):
print('Removing ' + obj.key)
print("Removing " + obj.key)
obj.delete()
def make_tar_file_for_table(clickhouse_data_path, db_name, table_name,
tmp_prefix):
def make_tar_file_for_table(clickhouse_data_path, db_name, table_name, tmp_prefix):
relative_data_path = os.path.join('data', db_name, table_name)
relative_meta_path = os.path.join('metadata', db_name, table_name + '.sql')
relative_data_path = os.path.join("data", db_name, table_name)
relative_meta_path = os.path.join("metadata", db_name, table_name + ".sql")
path_to_data = os.path.join(clickhouse_data_path, relative_data_path)
path_to_metadata = os.path.join(clickhouse_data_path, relative_meta_path)
temporary_file_name = tmp_prefix + '/{tname}.tar'.format(tname=table_name)
temporary_file_name = tmp_prefix + "/{tname}.tar".format(tname=table_name)
with tarfile.open(temporary_file_name, "w") as bundle:
bundle.add(path_to_data, arcname=relative_data_path)
bundle.add(path_to_metadata, arcname=relative_meta_path)
return temporary_file_name
USAGE_EXAMPLES = '''
USAGE_EXAMPLES = """
examples:
\t./s3uploader --dataset-name some_ds --access-key-id XXX --secret-access-key YYY --clickhouse-data-path /opt/clickhouse/ --table-name default.some_tbl --bucket-name some-bucket
\t./s3uploader --dataset-name some_ds --access-key-id XXX --secret-access-key YYY --file-path some_ds.tsv.xz --bucket-name some-bucket --s3-path /path/to/
'''
"""
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
parser = argparse.ArgumentParser(
description="Simple tool for uploading datasets to clickhouse S3",
usage='%(prog)s [options] {}'.format(USAGE_EXAMPLES))
parser.add_argument('--s3-api-url', default='s3.amazonaws.com')
parser.add_argument('--s3-common-url', default='s3.amazonaws.com')
parser.add_argument('--bucket-name', default='clickhouse-datasets')
parser.add_argument('--dataset-name', required=True,
help='Name of dataset, will be used in uploaded path')
parser.add_argument('--access-key-id', required=True)
parser.add_argument('--secret-access-key', required=True)
parser.add_argument('--clickhouse-data-path',
default='/var/lib/clickhouse/',
help='Path to clickhouse database on filesystem')
parser.add_argument('--s3-path', help='Path in s3, where to upload file')
parser.add_argument('--tmp-prefix', default='/tmp',
help='Prefix to store temporary downloaded file')
usage="%(prog)s [options] {}".format(USAGE_EXAMPLES),
)
parser.add_argument("--s3-api-url", default="s3.amazonaws.com")
parser.add_argument("--s3-common-url", default="s3.amazonaws.com")
parser.add_argument("--bucket-name", default="clickhouse-datasets")
parser.add_argument(
"--dataset-name",
required=True,
help="Name of dataset, will be used in uploaded path",
)
parser.add_argument("--access-key-id", required=True)
parser.add_argument("--secret-access-key", required=True)
parser.add_argument(
"--clickhouse-data-path",
default="/var/lib/clickhouse/",
help="Path to clickhouse database on filesystem",
)
parser.add_argument("--s3-path", help="Path in s3, where to upload file")
parser.add_argument(
"--tmp-prefix", default="/tmp", help="Prefix to store temporary downloaded file"
)
data_group = parser.add_mutually_exclusive_group(required=True)
table_name_argument = data_group.add_argument('--table-name',
help='Name of table with database, if you are uploading partitions')
data_group.add_argument('--file-path',
help='Name of file, if you are uploading')
data_group.add_argument('--directory-path', help='Path to directory with files to upload')
data_group.add_argument('--list-directory', help='List s3 directory by --directory-path')
data_group.add_argument('--remove-directory', help='Remove s3 directory by --directory-path')
table_name_argument = data_group.add_argument(
"--table-name",
help="Name of table with database, if you are uploading partitions",
)
data_group.add_argument("--file-path", help="Name of file, if you are uploading")
data_group.add_argument(
"--directory-path", help="Path to directory with files to upload"
)
data_group.add_argument(
"--list-directory", help="List s3 directory by --directory-path"
)
data_group.add_argument(
"--remove-directory", help="Remove s3 directory by --directory-path"
)
args = parser.parse_args()
if args.table_name is not None and args.clickhouse_data_path is None:
raise argparse.ArgumentError(table_name_argument,
"You should specify --clickhouse-data-path to upload --table")
raise argparse.ArgumentError(
table_name_argument,
"You should specify --clickhouse-data-path to upload --table",
)
s3_conn = S3API(
args.access_key_id, args.secret_access_key,
args.s3_api_url, args.s3_common_url)
args.access_key_id, args.secret_access_key, args.s3_api_url, args.s3_common_url
)
file_path = ''
file_path = ""
directory_path = args.directory_path
s3_path = args.s3_path
if args.list_directory:
s3_conn.list_bucket_keys(args.bucket_name)
elif args.remove_directory:
print('Removing s3 path: ' + args.remove_directory)
print("Removing s3 path: " + args.remove_directory)
s3_conn.remove_folder_from_bucket(args.bucket_name, args.remove_directory)
elif args.directory_path is not None:
url = s3_conn.upload_data_for_static_files_disk(args.bucket_name, directory_path, s3_path)
url = s3_conn.upload_data_for_static_files_disk(
args.bucket_name, directory_path, s3_path
)
logging.info("Data uploaded: %s", url)
else:
if args.table_name is not None:
if '.' not in args.table_name:
db_name = 'default'
if "." not in args.table_name:
db_name = "default"
else:
db_name, table_name = args.table_name.split('.')
db_name, table_name = args.table_name.split(".")
file_path = make_tar_file_for_table(
args.clickhouse_data_path, db_name, table_name, args.tmp_prefix)
args.clickhouse_data_path, db_name, table_name, args.tmp_prefix
)
else:
file_path = args.file_path
if 'tsv' in file_path:
if "tsv" in file_path:
s3_path = os.path.join(
args.dataset_name, 'tsv', os.path.basename(file_path))
args.dataset_name, "tsv", os.path.basename(file_path)
)
if args.table_name is not None:
s3_path = os.path.join(
args.dataset_name, 'partitions', os.path.basename(file_path))
args.dataset_name, "partitions", os.path.basename(file_path)
)
elif args.s3_path is not None:
s3_path = os.path.join(
args.dataset_name, args.s3_path, os.path.basename(file_path))
args.dataset_name, args.s3_path, os.path.basename(file_path)
)
else:
raise Exception("Don't know s3-path to upload")


@ -11,13 +11,14 @@ from termcolor import colored
import sys
COLORMAP = {
"success": colored("success", 'green'),
"failure": colored("failure", 'red'),
"error": colored("error", 'red'),
"pending": colored("pending", 'yellow'),
"not run": colored("not run", 'white'),
"success": colored("success", "green"),
"failure": colored("failure", "red"),
"error": colored("error", "red"),
"pending": colored("pending", "yellow"),
"not run": colored("not run", "white"),
}
def _filter_statuses(statuses):
"""
Squash statuses to latest state
@ -69,7 +70,7 @@ if __name__ == "__main__":
date_since = datetime.datetime.strptime(args.since, "%Y-%m-%d %H:%M:%S")
gh = Github(args.token)
repo = gh.get_repo('ClickHouse/ClickHouse')
repo = gh.get_repo("ClickHouse/ClickHouse")
commits = get_commits(repo, date_since)
longest_header = []
@ -101,6 +102,6 @@ if __name__ == "__main__":
result_data.append(current_result)
if sys.stdout.isatty():
longest_header = [colored(h, 'white', attrs=['bold']) for h in longest_header]
longest_header = [colored(h, "white", attrs=["bold"]) for h in longest_header]
print(tabulate.tabulate(result_data, headers=longest_header, tablefmt="grid"))
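The changes above are mechanical reformatting (quote normalization, line wrapping, trailing-comma insertion), so such a result can be re-checked locally with the black formatter. A minimal sketch, assuming the black package is installed (the command-line equivalent is black --check); is_black_formatted is an illustrative name:

import black


def is_black_formatted(path):
    # True when the file already matches black's default style.
    with open(path, "r", encoding="utf-8") as f:
        source = f.read()
    try:
        formatted = black.format_str(source, mode=black.Mode())
    except black.InvalidInput:
        # Not parseable as Python; black cannot format it.
        return False
    return formatted == source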