apply black formatter

Anton Popov 2023-03-23 15:33:23 +00:00
parent 21f5d20b9e
commit 0ee8dfad53
31 changed files with 1999 additions and 1059 deletions
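Every hunk below is a mechanical rewrite by the formatter: string literals are normalized to double quotes, statements longer than black's default 88-character limit are wrapped, long argument lists are exploded one argument per line with a trailing comma, redundant trailing semicolons are dropped, and two blank lines are enforced between top-level definitions. The commit does not record how the formatter was run, so the command mentioned in the comment below is an assumption; the before/after pair is a minimal sketch copied from the first file in this diff.

import os
import tempfile


# Presumably produced by running `black .` at the repository root (assumed,
# not recorded in the commit).
def _get_temp_file_name():
    # Before formatting this was a single line over the 88-character limit:
    #     return os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()))
    # After formatting, black wraps the call and keeps both arguments on one indented line.
    return os.path.join(
        tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())
    )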

View File

@@ -10,31 +10,38 @@ import requests
 import tempfile

-DEFAULT_URL = 'https://clickhouse-datasets.s3.amazonaws.com'
+DEFAULT_URL = "https://clickhouse-datasets.s3.amazonaws.com"
 AVAILABLE_DATASETS = {
-    'hits': 'hits_v1.tar',
-    'visits': 'visits_v1.tar',
+    "hits": "hits_v1.tar",
+    "visits": "visits_v1.tar",
 }
 RETRIES_COUNT = 5

 def _get_temp_file_name():
-    return os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()))
+    return os.path.join(
+        tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())
+    )

 def build_url(base_url, dataset):
-    return os.path.join(base_url, dataset, 'partitions', AVAILABLE_DATASETS[dataset])
+    return os.path.join(base_url, dataset, "partitions", AVAILABLE_DATASETS[dataset])

 def dowload_with_progress(url, path):
     logging.info("Downloading from %s to temp path %s", url, path)
     for i in range(RETRIES_COUNT):
         try:
-            with open(path, 'wb') as f:
+            with open(path, "wb") as f:
                 response = requests.get(url, stream=True)
                 response.raise_for_status()
-                total_length = response.headers.get('content-length')
+                total_length = response.headers.get("content-length")
                 if total_length is None or int(total_length) == 0:
-                    logging.info("No content-length, will download file without progress")
+                    logging.info(
+                        "No content-length, will download file without progress"
+                    )
                     f.write(response.content)
                 else:
                     dl = 0
@@ -46,7 +53,11 @@ def dowload_with_progress(url, path):
                         if sys.stdout.isatty():
                             done = int(50 * dl / total_length)
                             percent = int(100 * float(dl) / total_length)
-                            sys.stdout.write("\r[{}{}] {}%".format('=' * done, ' ' * (50-done), percent))
+                            sys.stdout.write(
+                                "\r[{}{}] {}%".format(
+                                    "=" * done, " " * (50 - done), percent
+                                )
+                            )
                             sys.stdout.flush()
             break
         except Exception as ex:
@@ -56,14 +67,21 @@ def dowload_with_progress(url, path):
             if os.path.exists(path):
                 os.remove(path)
     else:
-        raise Exception("Cannot download dataset from {}, all retries exceeded".format(url))
+        raise Exception(
+            "Cannot download dataset from {}, all retries exceeded".format(url)
+        )

     sys.stdout.write("\n")
     logging.info("Downloading finished")

 def unpack_to_clickhouse_directory(tar_path, clickhouse_path):
-    logging.info("Will unpack data from temp path %s to clickhouse db %s", tar_path, clickhouse_path)
-    with tarfile.open(tar_path, 'r') as comp_file:
+    logging.info(
+        "Will unpack data from temp path %s to clickhouse db %s",
+        tar_path,
+        clickhouse_path,
+    )
+    with tarfile.open(tar_path, "r") as comp_file:
         comp_file.extractall(path=clickhouse_path)
     logging.info("Unpack finished")

@@ -72,15 +90,21 @@ if __name__ == "__main__":
     logging.basicConfig(level=logging.INFO)
     parser = argparse.ArgumentParser(
-        description="Simple tool for dowloading datasets for clickhouse from S3")
-    parser.add_argument('--dataset-names', required=True, nargs='+', choices=list(AVAILABLE_DATASETS.keys()))
-    parser.add_argument('--url-prefix', default=DEFAULT_URL)
-    parser.add_argument('--clickhouse-data-path', default='/var/lib/clickhouse/')
+        description="Simple tool for dowloading datasets for clickhouse from S3"
+    )
+    parser.add_argument(
+        "--dataset-names",
+        required=True,
+        nargs="+",
+        choices=list(AVAILABLE_DATASETS.keys()),
+    )
+    parser.add_argument("--url-prefix", default=DEFAULT_URL)
+    parser.add_argument("--clickhouse-data-path", default="/var/lib/clickhouse/")

     args = parser.parse_args()
     datasets = args.dataset_names
-    logging.info("Will fetch following datasets: %s", ', '.join(datasets))
+    logging.info("Will fetch following datasets: %s", ", ".join(datasets))

     for dataset in datasets:
         logging.info("Processing %s", dataset)
         temp_archive_path = _get_temp_file_name()
@@ -92,10 +116,11 @@ if __name__ == "__main__":
             logging.info("Some exception occured %s", str(ex))
             raise
         finally:
-            logging.info("Will remove downloaded file %s from filesystem if it exists", temp_archive_path)
+            logging.info(
+                "Will remove downloaded file %s from filesystem if it exists",
+                temp_archive_path,
+            )
             if os.path.exists(temp_archive_path):
                 os.remove(temp_archive_path)
         logging.info("Processing of %s finished", dataset)
     logging.info("Fetch finished, enjoy your tables!")

View File

@@ -77,7 +77,7 @@ def trim_for_log(s):
return s return s
lines = s.splitlines() lines = s.splitlines()
if len(lines) > 10000: if len(lines) > 10000:
separator = "-" * 40 + str(len(lines) - 10000) + " lines are hidden" + "-" * 40 separator = "-" * 40 + str(len(lines) - 10000) + " lines are hidden" + "-" * 40
return "\n".join(lines[:5000] + [] + [separator] + [] + lines[-5000:]) return "\n".join(lines[:5000] + [] + [separator] + [] + lines[-5000:])
else: else:
return "\n".join(lines) return "\n".join(lines)
@@ -95,7 +95,13 @@ class HTTPError(Exception):
 # Helpers to execute queries via HTTP interface.
 def clickhouse_execute_http(
-    base_args, query, timeout=30, settings=None, default_format=None, max_http_retries=5, retry_error_codes=False
+    base_args,
+    query,
+    timeout=30,
+    settings=None,
+    default_format=None,
+    max_http_retries=5,
+    retry_error_codes=False,
 ):
     if args.secure:
         client = http.client.HTTPSConnection(
@@ -146,12 +152,36 @@ def clickhouse_execute_http(
     return data

-def clickhouse_execute(base_args, query, timeout=30, settings=None, max_http_retries=5, retry_error_codes=False):
-    return clickhouse_execute_http(base_args, query, timeout, settings, max_http_retries=max_http_retries, retry_error_codes=retry_error_codes).strip()
+def clickhouse_execute(
+    base_args,
+    query,
+    timeout=30,
+    settings=None,
+    max_http_retries=5,
+    retry_error_codes=False,
+):
+    return clickhouse_execute_http(
+        base_args,
+        query,
+        timeout,
+        settings,
+        max_http_retries=max_http_retries,
+        retry_error_codes=retry_error_codes,
+    ).strip()

-def clickhouse_execute_json(base_args, query, timeout=60, settings=None, max_http_retries=5):
-    data = clickhouse_execute_http(base_args, query, timeout, settings, "JSONEachRow", max_http_retries=max_http_retries)
+def clickhouse_execute_json(
+    base_args, query, timeout=60, settings=None, max_http_retries=5
+):
+    data = clickhouse_execute_http(
+        base_args,
+        query,
+        timeout,
+        settings,
+        "JSONEachRow",
+        max_http_retries=max_http_retries,
+    )
     if not data:
         return None
     rows = []
@@ -648,7 +678,9 @@ class TestCase:
             clickhouse_execute(
                 args,
-                "CREATE DATABASE IF NOT EXISTS " + database + get_db_engine(testcase_args, database),
+                "CREATE DATABASE IF NOT EXISTS "
+                + database
+                + get_db_engine(testcase_args, database),
                 settings=get_create_database_settings(args, testcase_args),
             )
@@ -831,7 +863,8 @@ class TestCase:
         # TODO: remove checking "no-upgrade-check" after 23.1
         elif args.upgrade_check and (
-            "no-upgrade-check" in tags or "no-upgrade-check" in tags):
+            "no-upgrade-check" in tags or "no-upgrade-check" in tags
+        ):
             return FailureReason.NO_UPGRADE_CHECK

         elif tags and ("no-s3-storage" in tags) and args.s3_storage:
@@ -1051,7 +1084,11 @@ class TestCase:
     @staticmethod
     def send_test_name_failed(suite: str, case: str):
         pid = os.getpid()
-        clickhouse_execute(args, f"SELECT 'Running test {suite}/{case} from pid={pid}'", retry_error_codes=True)
+        clickhouse_execute(
+            args,
+            f"SELECT 'Running test {suite}/{case} from pid={pid}'",
+            retry_error_codes=True,
+        )

     def run_single_test(
         self, server_logs_level, client_options
@@ -2220,6 +2257,7 @@ def find_binary(name):
     raise Exception(f"{name} was not found in PATH")

+
 def find_clickhouse_command(binary, command):
     symlink = binary + "-" + command
     if os.access(symlink, os.X_OK):
@@ -2228,6 +2266,7 @@ def find_clickhouse_command(binary, command):
     # To avoid requiring symlinks (in case you download binary from CI)
     return binary + " " + command

+
 def get_additional_client_options(args):
     if args.client_option:
         return " ".join("--" + option for option in args.client_option)
@@ -2569,7 +2608,9 @@ if __name__ == "__main__":
            "WARNING: --extract_from_config option is deprecated and will be removed the the future",
            file=sys.stderr,
        )
-        args.extract_from_config = find_clickhouse_command(args.binary, "extract-from-config")
+        args.extract_from_config = find_clickhouse_command(
+            args.binary, "extract-from-config"
+        )

     if args.configclient:
         args.client += " --config-file=" + args.configclient

View File

@@ -243,11 +243,18 @@ if __name__ == "__main__":
     )
     parser.add_argument(
-        "--no-random", action="store", dest="no_random", help="Disable tests order randomization"
+        "--no-random",
+        action="store",
+        dest="no_random",
+        help="Disable tests order randomization",
     )
     parser.add_argument(
-        "--pre-pull", action="store_true", default=False, dest="pre_pull", help="Pull images for docker_compose before all other actions"
+        "--pre-pull",
+        action="store_true",
+        default=False,
+        dest="pre_pull",
+        help="Pull images for docker_compose before all other actions",
     )

     parser.add_argument(
@@ -306,7 +313,6 @@ if __name__ == "__main__":
     # if not args.no_random:
     #     rand_args += f"--random-seed={os.getpid()}"
-
     net = ""
     if args.network:
         net = "--net={}".format(args.network)
@@ -416,8 +422,11 @@ if __name__ == "__main__":
         name=CONTAINER_NAME,
     )
     cmd = cmd_base + " " + args.command
-    cmd_pre_pull = cmd_base + " find /compose -name docker_compose_*.yml -exec docker-compose -f '{}' pull \;"
+    cmd_pre_pull = (
+        cmd_base
+        + " find /compose -name docker_compose_*.yml -exec docker-compose -f '{}' pull \;"
+    )

     containers = subprocess.check_output(
         f"docker ps --all --quiet --filter name={CONTAINER_NAME} --format={{{{.ID}}}}",

View File

@@ -1,57 +1,72 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
def gen_queries(): def gen_queries():
create_template = 'create table tab_00386 (a Int8, b String, c Tuple(Int8), d Tuple(Tuple(Int8)), e Tuple(Int8, String), f Tuple(Tuple(Int8, String))) engine = MergeTree order by ({}) partition by {}' create_template = "create table tab_00386 (a Int8, b String, c Tuple(Int8), d Tuple(Tuple(Int8)), e Tuple(Int8, String), f Tuple(Tuple(Int8, String))) engine = MergeTree order by ({}) partition by {}"
drop_query = 'drop table if exists tab_00386' drop_query = "drop table if exists tab_00386"
values = ('1', "'a'", 'tuple(1)', 'tuple(tuple(1))', "(1, 'a')", "tuple((1, 'a'))") values = ("1", "'a'", "tuple(1)", "tuple(tuple(1))", "(1, 'a')", "tuple((1, 'a'))")
insert_query = "insert into tab_00386 values (1, 'a', tuple(1), tuple(tuple(1)), (1, 'a'), tuple((1, 'a')))" insert_query = "insert into tab_00386 values (1, 'a', tuple(1), tuple(tuple(1)), (1, 'a'), tuple((1, 'a')))"
columns = tuple('a b c d'.split()) columns = tuple("a b c d".split())
order_by_columns = tuple('a b c'.split()) order_by_columns = tuple("a b c".split())
partition_by_columns = tuple(' tuple() a'.split()) partition_by_columns = tuple(" tuple() a".split())
for partition in partition_by_columns: for partition in partition_by_columns:
for key_mask in range(1, 1 << len(order_by_columns)): for key_mask in range(1, 1 << len(order_by_columns)):
key = ','.join(order_by_columns[i] for i in range(len(order_by_columns)) if (1 << i) & key_mask != 0) key = ",".join(
order_by_columns[i]
for i in range(len(order_by_columns))
if (1 << i) & key_mask != 0
)
create_query = create_template.format(key, partition) create_query = create_template.format(key, partition)
for q in (drop_query, create_query, insert_query): for q in (drop_query, create_query, insert_query):
yield q yield q
for column, value in zip(columns, values): for column, value in zip(columns, values):
yield 'select {} in {} from tab_00386'.format(column, value) yield "select {} in {} from tab_00386".format(column, value)
yield 'select {} in tuple({}) from tab_00386'.format(column, value) yield "select {} in tuple({}) from tab_00386".format(column, value)
yield 'select {} in (select {} from tab_00386) from tab_00386'.format(column, column) yield "select {} in (select {} from tab_00386) from tab_00386".format(
column, column
)
for i in range(len(columns)): for i in range(len(columns)):
for j in range(i, len(columns)): for j in range(i, len(columns)):
yield 'select ({}, {}) in tuple({}, {}) from tab_00386'.format(columns[i], columns[j], values[i], values[j]) yield "select ({}, {}) in tuple({}, {}) from tab_00386".format(
yield 'select ({}, {}) in (select {}, {} from tab_00386) from tab_00386'.format(columns[i], columns[j], columns[i], columns[j]) columns[i], columns[j], values[i], values[j]
yield 'select ({}, {}) in (select ({}, {}) from tab_00386) from tab_00386'.format(columns[i], columns[j], columns[i], columns[j]) )
yield "select ({}, {}) in (select {}, {} from tab_00386) from tab_00386".format(
columns[i], columns[j], columns[i], columns[j]
)
yield "select ({}, {}) in (select ({}, {}) from tab_00386) from tab_00386".format(
columns[i], columns[j], columns[i], columns[j]
)
yield "select e in (1, 'a') from tab_00386" yield "select e in (1, 'a') from tab_00386"
yield "select f in tuple((1, 'a')) from tab_00386" yield "select f in tuple((1, 'a')) from tab_00386"
yield "select f in tuple(tuple((1, 'a'))) from tab_00386" yield "select f in tuple(tuple((1, 'a'))) from tab_00386"
yield 'select e in (select a, b from tab_00386) from tab_00386' yield "select e in (select a, b from tab_00386) from tab_00386"
yield 'select e in (select (a, b) from tab_00386) from tab_00386' yield "select e in (select (a, b) from tab_00386) from tab_00386"
yield 'select f in (select tuple((a, b)) from tab_00386) from tab_00386' yield "select f in (select tuple((a, b)) from tab_00386) from tab_00386"
yield 'select tuple(f) in (select tuple(tuple((a, b))) from tab_00386) from tab_00386' yield "select tuple(f) in (select tuple(tuple((a, b))) from tab_00386) from tab_00386"
import requests import requests
import os import os
def main(): def main():
url = os.environ['CLICKHOUSE_URL'] url = os.environ["CLICKHOUSE_URL"]
for q in gen_queries(): for q in gen_queries():
resp = requests.post(url, data=q) resp = requests.post(url, data=q)
if resp.status_code != 200 or resp.text.strip() not in ('1', ''): if resp.status_code != 200 or resp.text.strip() not in ("1", ""):
print('Query:', q) print("Query:", q)
print('Code:', resp.status_code) print("Code:", resp.status_code)
print(resp.text) print(resp.text)
break break
requests.post(url, data='drop table tab_00386') requests.post(url, data="drop table tab_00386")
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@@ -2,8 +2,20 @@
import os, itertools, urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse, sys import os, itertools, urllib.request, urllib.parse, urllib.error, urllib.request, urllib.error, urllib.parse, sys
def get_ch_answer(query): def get_ch_answer(query):
return urllib.request.urlopen(os.environ.get('CLICKHOUSE_URL', 'http://localhost:' + os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') ), data=query.encode()).read().decode() return (
urllib.request.urlopen(
os.environ.get(
"CLICKHOUSE_URL",
"http://localhost:" + os.environ.get("CLICKHOUSE_PORT_HTTP", "8123"),
),
data=query.encode(),
)
.read()
.decode()
)
def check_answers(query, answer): def check_answers(query, answer):
ch_answer = get_ch_answer(query) ch_answer = get_ch_answer(query)
@@ -13,36 +25,34 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer) print("Fetched answer :", ch_answer)
exit(-1) exit(-1)
def get_values(): def get_values():
values = [0, 1, -1] values = [0, 1, -1]
for bits in [8, 16, 32, 64]: for bits in [8, 16, 32, 64]:
values += [2**bits, 2**bits - 1] values += [2**bits, 2**bits - 1]
values += [2**(bits-1) - 1, 2**(bits-1), 2**(bits-1) + 1] values += [2 ** (bits - 1) - 1, 2 ** (bits - 1), 2 ** (bits - 1) + 1]
values += [-2**(bits-1) - 1, -2**(bits-1), -2**(bits-1) + 1] values += [-(2 ** (bits - 1)) - 1, -(2 ** (bits - 1)), -(2 ** (bits - 1)) + 1]
return values return values
def is_valid_integer(x): def is_valid_integer(x):
return -2**63 <= x and x <= 2**64-1 return -(2**63) <= x and x <= 2**64 - 1
-TEST_WITH_CASTING=True
-GENERATE_TEST_FILES=False
+TEST_WITH_CASTING = True
+GENERATE_TEST_FILES = False

 TYPES = {
-    "UInt8" : { "bits" : 8, "sign" : False, "float" : False },
-    "Int8" : { "bits" : 8, "sign" : True, "float" : False },
-    "UInt16": { "bits" : 16, "sign" : False, "float" : False },
-    "Int16" : { "bits" : 16, "sign" : True, "float" : False },
-    "UInt32": { "bits" : 32, "sign" : False, "float" : False },
-    "Int32" : { "bits" : 32, "sign" : True, "float" : False },
-    "UInt64": { "bits" : 64, "sign" : False, "float" : False },
-    "Int64" : { "bits" : 64, "sign" : True, "float" : False }
-    #"Float32" : { "bits" : 32, "sign" : True, "float" : True },
-    #"Float64" : { "bits" : 64, "sign" : True, "float" : True }
+    "UInt8": {"bits": 8, "sign": False, "float": False},
+    "Int8": {"bits": 8, "sign": True, "float": False},
+    "UInt16": {"bits": 16, "sign": False, "float": False},
+    "Int16": {"bits": 16, "sign": True, "float": False},
+    "UInt32": {"bits": 32, "sign": False, "float": False},
+    "Int32": {"bits": 32, "sign": True, "float": False},
+    "UInt64": {"bits": 64, "sign": False, "float": False},
+    "Int64": {"bits": 64, "sign": True, "float": False}
+    # "Float32" : { "bits" : 32, "sign" : True, "float" : True },
+    # "Float64" : { "bits" : 64, "sign" : True, "float" : True }
 }
@@ -55,14 +65,18 @@ def inside_range(value, type_name):
return True return True
if signed: if signed:
return -2**(bits-1) <= value and value <= 2**(bits-1) - 1 return -(2 ** (bits - 1)) <= value and value <= 2 ** (bits - 1) - 1
else: else:
return 0 <= value and value <= 2**bits - 1 return 0 <= value and value <= 2**bits - 1
def test_operators(v1, v2, v1_passed, v2_passed): def test_operators(v1, v2, v1_passed, v2_passed):
query_str = "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2},\t".format(v1=v1_passed, v2=v2_passed) query_str = "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2},\t".format(
query_str += "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2} ".format(v1=v2_passed, v2=v1_passed) v1=v1_passed, v2=v2_passed
)
query_str += "{v1} = {v2}, {v1} != {v2}, {v1} < {v2}, {v1} <= {v2}, {v1} > {v2}, {v1} >= {v2} ".format(
v1=v2_passed, v2=v1_passed
)
answers = [v1 == v2, v1 != v2, v1 < v2, v1 <= v2, v1 > v2, v1 >= v2] answers = [v1 == v2, v1 != v2, v1 < v2, v1 <= v2, v1 > v2, v1 >= v2]
answers += [v2 == v1, v2 != v1, v2 < v1, v2 <= v1, v2 > v1, v2 >= v1] answers += [v2 == v1, v2 != v1, v2 < v1, v2 <= v1, v2 > v1, v2 >= v1]
@@ -74,6 +88,7 @@ def test_operators(v1, v2, v1_passed, v2_passed):
VALUES = [x for x in get_values() if is_valid_integer(x)] VALUES = [x for x in get_values() if is_valid_integer(x)]
def test_pair(v1, v2): def test_pair(v1, v2):
query = "SELECT {}, {}, ".format(v1, v2) query = "SELECT {}, {}, ".format(v1, v2)
answers = "{}\t{}\t".format(v1, v2) answers = "{}\t{}\t".format(v1, v2)
@@ -87,19 +102,58 @@ def test_pair(v1, v2):
if inside_range(v1, t1): if inside_range(v1, t1):
for t2 in TYPES.keys(): for t2 in TYPES.keys():
if inside_range(v2, t2): if inside_range(v2, t2):
q, a = test_operators(v1, v2, 'to{}({})'.format(t1, v1), 'to{}({})'.format(t2, v2)) q, a = test_operators(
query += ', ' + q v1, v2, "to{}({})".format(t1, v1), "to{}({})".format(t2, v2)
)
query += ", " + q
answers += "\t" + a answers += "\t" + a
check_answers(query, answers) check_answers(query, answers)
return query, answers return query, answers
VALUES_INT = [0, -1, 1, 2**64-1, 2**63, -2**63, 2**63-1, 2**51, 2**52, 2**53-1, 2**53, 2**53+1, 2**53+2, -2**53+1, -2**53, -2**53-1, -2**53-2, 2*52, -2**52] VALUES_INT = [
VALUES_FLOAT = [float(x) for x in VALUES_INT + [-0.5, 0.5, -1.5, 1.5, 2**53, 2**51 - 0.5, 2**51 + 0.5, 2**60, -2**60, -2**63 - 10000, 2**63 + 10000]] 0,
-1,
1,
2**64 - 1,
2**63,
-(2**63),
2**63 - 1,
2**51,
2**52,
2**53 - 1,
2**53,
2**53 + 1,
2**53 + 2,
-(2**53) + 1,
-(2**53),
-(2**53) - 1,
-(2**53) - 2,
2 * 52,
-(2**52),
]
VALUES_FLOAT = [
float(x)
for x in VALUES_INT
+ [
-0.5,
0.5,
-1.5,
1.5,
2**53,
2**51 - 0.5,
2**51 + 0.5,
2**60,
-(2**60),
-(2**63) - 10000,
2**63 + 10000,
]
]
def test_float_pair(i, f): def test_float_pair(i, f):
f_str = ("%.9f" % f) f_str = "%.9f" % f
query = "SELECT '{}', '{}', ".format(i, f_str) query = "SELECT '{}', '{}', ".format(i, f_str)
answers = "{}\t{}\t".format(i, f_str) answers = "{}\t{}\t".format(i, f_str)
@@ -110,8 +164,8 @@ def test_float_pair(i, f):
if TEST_WITH_CASTING: if TEST_WITH_CASTING:
for t1 in TYPES.keys(): for t1 in TYPES.keys():
if inside_range(i, t1): if inside_range(i, t1):
q, a = test_operators(i, f, 'to{}({})'.format(t1, i), f_str) q, a = test_operators(i, f, "to{}({})".format(t1, i), f_str)
query += ', ' + q query += ", " + q
answers += "\t" + a answers += "\t" + a
check_answers(query, answers) check_answers(query, answers)
@@ -120,22 +174,26 @@ def test_float_pair(i, f):
def main(): def main():
if GENERATE_TEST_FILES: if GENERATE_TEST_FILES:
base_name = '00411_accurate_number_comparison' base_name = "00411_accurate_number_comparison"
sql_file = open(base_name + '.sql', 'wt') sql_file = open(base_name + ".sql", "wt")
ref_file = open(base_name + '.reference', 'wt') ref_file = open(base_name + ".reference", "wt")
num_int_tests = len(list(itertools.combinations(VALUES, 2))) num_int_tests = len(list(itertools.combinations(VALUES, 2)))
num_parts = 4 num_parts = 4
for part in range(0, num_parts): for part in range(0, num_parts):
if 'int' + str(part + 1) in sys.argv[1:]: if "int" + str(part + 1) in sys.argv[1:]:
for (v1, v2) in itertools.islice(itertools.combinations(VALUES, 2), part * num_int_tests // num_parts, (part + 1) * num_int_tests // num_parts): for (v1, v2) in itertools.islice(
itertools.combinations(VALUES, 2),
part * num_int_tests // num_parts,
(part + 1) * num_int_tests // num_parts,
):
q, a = test_pair(v1, v2) q, a = test_pair(v1, v2)
if GENERATE_TEST_FILES: if GENERATE_TEST_FILES:
sql_file.write(q + ";\n") sql_file.write(q + ";\n")
ref_file.write(a + "\n") ref_file.write(a + "\n")
if 'float' in sys.argv[1:]: if "float" in sys.argv[1:]:
for (i, f) in itertools.product(VALUES_INT, VALUES_FLOAT): for (i, f) in itertools.product(VALUES_INT, VALUES_FLOAT):
q, a = test_float_pair(i, f) q, a = test_float_pair(i, f)
if GENERATE_TEST_FILES: if GENERATE_TEST_FILES:

View File

@@ -12,6 +12,7 @@ import subprocess
from io import StringIO from io import StringIO
from http.server import BaseHTTPRequestHandler, HTTPServer from http.server import BaseHTTPRequestHandler, HTTPServer
def is_ipv6(host): def is_ipv6(host):
try: try:
socket.inet_aton(host) socket.inet_aton(host)
@@ -19,6 +20,7 @@ def is_ipv6(host):
except: except:
return True return True
def get_local_port(host, ipv6): def get_local_port(host, ipv6):
if ipv6: if ipv6:
family = socket.AF_INET6 family = socket.AF_INET6
@@ -29,8 +31,9 @@ def get_local_port(host, ipv6):
fd.bind((host, 0)) fd.bind((host, 0))
return fd.getsockname()[1] return fd.getsockname()[1]
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "127.0.0.1")
CLICKHOUSE_PORT_HTTP = os.environ.get("CLICKHOUSE_PORT_HTTP", "8123")
##################################################################################### #####################################################################################
# This test starts an HTTP server and serves data to clickhouse url-engine based table. # This test starts an HTTP server and serves data to clickhouse url-engine based table.
@@ -39,27 +42,42 @@ CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
##################################################################################### #####################################################################################
 # IP-address of this host accessible from the outside world. Get the first one
-HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0]
+HTTP_SERVER_HOST = (
+    subprocess.check_output(["hostname", "-i"]).decode("utf-8").strip().split()[0]
+)
 IS_IPV6 = is_ipv6(HTTP_SERVER_HOST)
 HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6)

 # IP address and port of the HTTP server started from this script.
 HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT)
 if IS_IPV6:
-    HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
+    HTTP_SERVER_URL_STR = (
+        "http://"
+        + f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+        + "/"
+    )
 else:
-    HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/"
+    HTTP_SERVER_URL_STR = (
+        "http://" + f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}" + "/"
+    )

-CSV_DATA = os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()))
+CSV_DATA = os.path.join(
+    tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())
+)
def get_ch_answer(query): def get_ch_answer(query):
host = CLICKHOUSE_HOST host = CLICKHOUSE_HOST
if IS_IPV6: if IS_IPV6:
host = f'[{host}]' host = f"[{host}]"
url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP)) url = os.environ.get(
"CLICKHOUSE_URL",
"http://{host}:{port}".format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP),
)
return urllib.request.urlopen(url, data=query.encode()).read().decode() return urllib.request.urlopen(url, data=query.encode()).read().decode()
def check_answers(query, answer): def check_answers(query, answer):
ch_answer = get_ch_answer(query) ch_answer = get_ch_answer(query)
if ch_answer.strip() != answer.strip(): if ch_answer.strip() != answer.strip():
@@ -68,18 +86,19 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer, file=sys.stderr) print("Fetched answer :", ch_answer, file=sys.stderr)
raise Exception("Fail on query") raise Exception("Fail on query")
class CSVHTTPServer(BaseHTTPRequestHandler): class CSVHTTPServer(BaseHTTPRequestHandler):
def _set_headers(self): def _set_headers(self):
self.send_response(200) self.send_response(200)
self.send_header('Content-type', 'text/csv') self.send_header("Content-type", "text/csv")
self.end_headers() self.end_headers()
def do_GET(self): def do_GET(self):
self._set_headers() self._set_headers()
with open(CSV_DATA, 'r') as fl: with open(CSV_DATA, "r") as fl:
reader = csv.reader(fl, delimiter=',') reader = csv.reader(fl, delimiter=",")
for row in reader: for row in reader:
self.wfile.write((', '.join(row) + '\n').encode()) self.wfile.write((", ".join(row) + "\n").encode())
return return
def do_HEAD(self): def do_HEAD(self):
@@ -87,33 +106,33 @@ class CSVHTTPServer(BaseHTTPRequestHandler):
return return
def read_chunk(self): def read_chunk(self):
msg = '' msg = ""
while True: while True:
sym = self.rfile.read(1) sym = self.rfile.read(1)
if sym == '': if sym == "":
break break
msg += sym.decode('utf-8') msg += sym.decode("utf-8")
if msg.endswith('\r\n'): if msg.endswith("\r\n"):
break break
length = int(msg[:-2], 16) length = int(msg[:-2], 16)
if length == 0: if length == 0:
return '' return ""
content = self.rfile.read(length) content = self.rfile.read(length)
self.rfile.read(2) # read sep \r\n self.rfile.read(2) # read sep \r\n
return content.decode('utf-8') return content.decode("utf-8")
def do_POST(self): def do_POST(self):
data = '' data = ""
while True: while True:
chunk = self.read_chunk() chunk = self.read_chunk()
if not chunk: if not chunk:
break break
data += chunk data += chunk
with StringIO(data) as fl: with StringIO(data) as fl:
reader = csv.reader(fl, delimiter=',') reader = csv.reader(fl, delimiter=",")
with open(CSV_DATA, 'a') as d: with open(CSV_DATA, "a") as d:
for row in reader: for row in reader:
d.write(','.join(row) + '\n') d.write(",".join(row) + "\n")
self._set_headers() self._set_headers()
self.wfile.write(b"ok") self.wfile.write(b"ok")
@@ -124,6 +143,7 @@ class CSVHTTPServer(BaseHTTPRequestHandler):
class HTTPServerV6(HTTPServer): class HTTPServerV6(HTTPServer):
address_family = socket.AF_INET6 address_family = socket.AF_INET6
def start_server(): def start_server():
if IS_IPV6: if IS_IPV6:
httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, CSVHTTPServer) httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, CSVHTTPServer)
@@ -133,49 +153,76 @@ def start_server():
t = threading.Thread(target=httpd.serve_forever) t = threading.Thread(target=httpd.serve_forever)
return t, httpd return t, httpd
# test section # test section
def test_select(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests=[], answers=[], test_data=""):
with open(CSV_DATA, 'w') as f: # clear file def test_select(
f.write('') table_name="",
schema="str String,numuint UInt32,numint Int32,double Float64",
requests=[],
answers=[],
test_data="",
):
with open(CSV_DATA, "w") as f: # clear file
f.write("")
if test_data: if test_data:
with open(CSV_DATA, 'w') as f: with open(CSV_DATA, "w") as f:
f.write(test_data + "\n") f.write(test_data + "\n")
if table_name: if table_name:
get_ch_answer("drop table if exists {}".format(table_name)) get_ch_answer("drop table if exists {}".format(table_name))
get_ch_answer("create table {} ({}) engine=URL('{}', 'CSV')".format(table_name, schema, HTTP_SERVER_URL_STR)) get_ch_answer(
"create table {} ({}) engine=URL('{}', 'CSV')".format(
table_name, schema, HTTP_SERVER_URL_STR
)
)
for i in range(len(requests)): for i in range(len(requests)):
tbl = table_name tbl = table_name
if not tbl: if not tbl:
tbl = "url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema) tbl = "url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
check_answers(requests[i].format(tbl=tbl), answers[i]) check_answers(requests[i].format(tbl=tbl), answers[i])
if table_name: if table_name:
get_ch_answer("drop table if exists {}".format(table_name)) get_ch_answer("drop table if exists {}".format(table_name))
def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests_insert=[], requests_select=[], answers=[]): def test_insert(
with open(CSV_DATA, 'w') as f: # flush test file table_name="",
f.write('') schema="str String,numuint UInt32,numint Int32,double Float64",
requests_insert=[],
requests_select=[],
answers=[],
):
with open(CSV_DATA, "w") as f: # flush test file
f.write("")
if table_name: if table_name:
get_ch_answer("drop table if exists {}".format(table_name)) get_ch_answer("drop table if exists {}".format(table_name))
get_ch_answer("create table {} ({}) engine=URL('{}', 'CSV')".format(table_name, schema, HTTP_SERVER_URL_STR)) get_ch_answer(
"create table {} ({}) engine=URL('{}', 'CSV')".format(
table_name, schema, HTTP_SERVER_URL_STR
)
)
for req in requests_insert: for req in requests_insert:
tbl = table_name tbl = table_name
if not tbl: if not tbl:
tbl = "table function url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema) tbl = "table function url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
get_ch_answer(req.format(tbl=tbl)) get_ch_answer(req.format(tbl=tbl))
for i in range(len(requests_select)): for i in range(len(requests_select)):
tbl = table_name tbl = table_name
if not tbl: if not tbl:
tbl = "url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema) tbl = "url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
check_answers(requests_select[i].format(tbl=tbl), answers[i]) check_answers(requests_select[i].format(tbl=tbl), answers[i])
if table_name: if table_name:
@@ -185,9 +232,11 @@ def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,do
def main(): def main():
test_data = "Hello,2,-2,7.7\nWorld,2,-5,8.8" test_data = "Hello,2,-2,7.7\nWorld,2,-5,8.8"
select_only_requests = { select_only_requests = {
"select str,numuint,numint,double from {tbl}" : test_data.replace(',', '\t'), "select str,numuint,numint,double from {tbl}": test_data.replace(",", "\t"),
"select numuint, count(*) from {tbl} group by numuint" : "2\t2", "select numuint, count(*) from {tbl} group by numuint": "2\t2",
"select str,numuint,numint,double from {tbl} limit 1": test_data.split("\n")[0].replace(',', '\t'), "select str,numuint,numint,double from {tbl} limit 1": test_data.split("\n")[
0
].replace(",", "\t"),
} }
insert_requests = [ insert_requests = [
@@ -196,21 +245,41 @@ def main():
] ]
select_requests = { select_requests = {
"select distinct numuint from {tbl} order by numuint": '\n'.join([str(i) for i in range(11)]), "select distinct numuint from {tbl} order by numuint": "\n".join(
"select count(*) from {tbl}": '12', [str(i) for i in range(11)]
'select double, count(*) from {tbl} group by double order by double': "7.7\t2\n9.9\t10" ),
"select count(*) from {tbl}": "12",
"select double, count(*) from {tbl} group by double order by double": "7.7\t2\n9.9\t10",
} }
t, httpd = start_server() t, httpd = start_server()
t.start() t.start()
# test table with url engine # test table with url engine
test_select(table_name="test_table_select", requests=list(select_only_requests.keys()), answers=list(select_only_requests.values()), test_data=test_data) test_select(
table_name="test_table_select",
requests=list(select_only_requests.keys()),
answers=list(select_only_requests.values()),
test_data=test_data,
)
# test table function url # test table function url
test_select(requests=list(select_only_requests.keys()), answers=list(select_only_requests.values()), test_data=test_data) test_select(
#test insert into table with url engine requests=list(select_only_requests.keys()),
test_insert(table_name="test_table_insert", requests_insert=insert_requests, requests_select=list(select_requests.keys()), answers=list(select_requests.values())) answers=list(select_only_requests.values()),
#test insert into table function url test_data=test_data,
test_insert(requests_insert=insert_requests, requests_select=list(select_requests.keys()), answers=list(select_requests.values())) )
# test insert into table with url engine
test_insert(
table_name="test_table_insert",
requests_insert=insert_requests,
requests_select=list(select_requests.keys()),
answers=list(select_requests.values()),
)
# test insert into table function url
test_insert(
requests_insert=insert_requests,
requests_select=list(select_requests.keys()),
answers=list(select_requests.values()),
)
httpd.shutdown() httpd.shutdown()
t.join() t.join()

View File

@@ -12,35 +12,46 @@ HAYSTACKS = [
NEEDLE = "needle" NEEDLE = "needle"
HAY_RE = re.compile(r'\bhay\b', re.IGNORECASE) HAY_RE = re.compile(r"\bhay\b", re.IGNORECASE)
NEEDLE_RE = re.compile(r'\bneedle\b', re.IGNORECASE) NEEDLE_RE = re.compile(r"\bneedle\b", re.IGNORECASE)
def replace_follow_case(replacement): def replace_follow_case(replacement):
def func(match): def func(match):
g = match.group() g = match.group()
if g.islower(): return replacement.lower() if g.islower():
if g.istitle(): return replacement.title() return replacement.lower()
if g.isupper(): return replacement.upper() if g.istitle():
return replacement.title()
if g.isupper():
return replacement.upper()
return replacement return replacement
return func return func
def replace_separators(query, new_sep): def replace_separators(query, new_sep):
SEP_RE = re.compile('\\s+') SEP_RE = re.compile("\\s+")
result = SEP_RE.sub(new_sep, query) result = SEP_RE.sub(new_sep, query)
return result return result
def enlarge_haystack(query, times, separator=''):
return HAY_RE.sub(replace_follow_case(('hay' + separator) * times), query) def enlarge_haystack(query, times, separator=""):
return HAY_RE.sub(replace_follow_case(("hay" + separator) * times), query)
def small_needle(query): def small_needle(query):
return NEEDLE_RE.sub(replace_follow_case('n'), query) return NEEDLE_RE.sub(replace_follow_case("n"), query)
def remove_needle(query): def remove_needle(query):
return NEEDLE_RE.sub('', query) return NEEDLE_RE.sub("", query)
def replace_needle(query, new_needle): def replace_needle(query, new_needle):
return NEEDLE_RE.sub(new_needle, query) return NEEDLE_RE.sub(new_needle, query)
# with str.lower, str.uppert, str.title and such # with str.lower, str.uppert, str.title and such
def transform_needle(query, string_transformation_func): def transform_needle(query, string_transformation_func):
def replace_with_transformation(match): def replace_with_transformation(match):
@@ -49,19 +60,21 @@ def transform_needle(query, string_transformation_func):
return NEEDLE_RE.sub(replace_with_transformation, query) return NEEDLE_RE.sub(replace_with_transformation, query)
def create_cases(case_sensitive_func, case_insensitive_func, table_row_template, table_query_template, const_query_template):
def create_cases(
case_sensitive_func,
case_insensitive_func,
table_row_template,
table_query_template,
const_query_template,
):
const_queries = [] const_queries = []
table_rows = [] table_rows = []
table_queries = set() table_queries = set()
def add_case(func, haystack, needle, match): def add_case(func, haystack, needle, match):
match = int(match) match = int(match)
args = dict( args = dict(func=func, haystack=haystack, needle=needle, match=match)
func = func,
haystack = haystack,
needle = needle,
match = match
)
const_queries.append(const_query_template.substitute(args)) const_queries.append(const_query_template.substitute(args))
table_queries.add(table_query_template.substitute(args)) table_queries.add(table_query_template.substitute(args))
table_rows.append(table_row_template.substitute(args)) table_rows.append(table_row_template.substitute(args))
@@ -69,14 +82,28 @@ def create_cases(case_sensitive_func, case_insensitive_func, table_row_template,
def add_case_sensitive(haystack, needle, match): def add_case_sensitive(haystack, needle, match):
add_case(case_sensitive_func, haystack, needle, match) add_case(case_sensitive_func, haystack, needle, match)
if match: if match:
add_case(case_sensitive_func, transform_needle(haystack, str.swapcase), transform_needle(needle, str.swapcase), match) add_case(
case_sensitive_func,
transform_needle(haystack, str.swapcase),
transform_needle(needle, str.swapcase),
match,
)
def add_case_insensitive(haystack, needle, match): def add_case_insensitive(haystack, needle, match):
add_case(case_insensitive_func, haystack, needle, match) add_case(case_insensitive_func, haystack, needle, match)
if match: if match:
add_case(case_insensitive_func, transform_needle(haystack, str.swapcase), needle, match) add_case(
add_case(case_insensitive_func, haystack, transform_needle(needle, str.swapcase), match) case_insensitive_func,
transform_needle(haystack, str.swapcase),
needle,
match,
)
add_case(
case_insensitive_func,
haystack,
transform_needle(needle, str.swapcase),
match,
)
# Negative cases # Negative cases
add_case_sensitive(remove_needle(HAYSTACKS[0]), NEEDLE, False) add_case_sensitive(remove_needle(HAYSTACKS[0]), NEEDLE, False)
@@ -85,7 +112,7 @@ def create_cases(case_sensitive_func, case_insensitive_func, table_row_template,
for haystack in HAYSTACKS: for haystack in HAYSTACKS:
add_case_sensitive(transform_needle(haystack, str.swapcase), NEEDLE, False) add_case_sensitive(transform_needle(haystack, str.swapcase), NEEDLE, False)
sep = '' sep = ""
h = replace_separators(haystack, sep) h = replace_separators(haystack, sep)
add_case_sensitive(h, NEEDLE, False) add_case_sensitive(h, NEEDLE, False)
@@ -102,8 +129,7 @@ def create_cases(case_sensitive_func, case_insensitive_func, table_row_template,
add_case_sensitive(haystack, NEEDLE, True) add_case_sensitive(haystack, NEEDLE, True)
add_case_insensitive(haystack, NEEDLE, True) add_case_insensitive(haystack, NEEDLE, True)
for sep in list(""" ,"""):
for sep in list(''' ,'''):
h = replace_separators(haystack, sep) h = replace_separators(haystack, sep)
add_case_sensitive(h, NEEDLE, True) add_case_sensitive(h, NEEDLE, True)
add_case_sensitive(small_needle(h), small_needle(NEEDLE), True) add_case_sensitive(small_needle(h), small_needle(NEEDLE), True)
@@ -114,32 +140,43 @@ def create_cases(case_sensitive_func, case_insensitive_func, table_row_template,
add_case_insensitive(enlarge_haystack(h, 200, sep), NEEDLE, True) add_case_insensitive(enlarge_haystack(h, 200, sep), NEEDLE, True)
# case insesitivity works only on ASCII strings # case insesitivity works only on ASCII strings
add_case_sensitive(replace_needle(h, 'иголка'), replace_needle(NEEDLE, 'иголка'), True) add_case_sensitive(
add_case_sensitive(replace_needle(h, '指针'), replace_needle(NEEDLE, '指针'), True) replace_needle(h, "иголка"), replace_needle(NEEDLE, "иголка"), True
)
add_case_sensitive(
replace_needle(h, "指针"), replace_needle(NEEDLE, "指针"), True
)
for sep in list('''~!@$%^&*()-=+|]}[{";:/?.><\t''') + [r'\\\\']: for sep in list("""~!@$%^&*()-=+|]}[{";:/?.><\t""") + [r"\\\\"]:
h = replace_separators(HAYSTACKS[0], sep) h = replace_separators(HAYSTACKS[0], sep)
add_case(case_sensitive_func, h, NEEDLE, True) add_case(case_sensitive_func, h, NEEDLE, True)
return table_rows, table_queries, const_queries return table_rows, table_queries, const_queries
def main():
def main():
def query(x): def query(x):
print(x) print(x)
CONST_QUERY = Template("""SELECT ${func}('${haystack}', '${needle}'), ' expecting ', ${match};""") CONST_QUERY = Template(
TABLE_QUERY = Template("""WITH '${needle}' as n """SELECT ${func}('${haystack}', '${needle}'), ' expecting ', ${match};"""
)
TABLE_QUERY = Template(
"""WITH '${needle}' as n
SELECT haystack, needle, ${func}(haystack, n) as result SELECT haystack, needle, ${func}(haystack, n) as result
FROM ht FROM ht
WHERE func = '${func}' AND needle = n AND result != match;""") WHERE func = '${func}' AND needle = n AND result != match;"""
)
TABLE_ROW = Template("""('${haystack}', '${needle}', ${match}, '${func}')""") TABLE_ROW = Template("""('${haystack}', '${needle}', ${match}, '${func}')""")
rows, table_queries, const_queries = create_cases('hasToken', 'hasTokenCaseInsensitive', TABLE_ROW, TABLE_QUERY, CONST_QUERY) rows, table_queries, const_queries = create_cases(
"hasToken", "hasTokenCaseInsensitive", TABLE_ROW, TABLE_QUERY, CONST_QUERY
)
for q in const_queries: for q in const_queries:
query(q) query(q)
query("""DROP TABLE IF EXISTS ht; query(
"""DROP TABLE IF EXISTS ht;
CREATE TABLE IF NOT EXISTS CREATE TABLE IF NOT EXISTS
ht ht
( (
@@ -150,11 +187,15 @@ def main():
) )
ENGINE MergeTree() ENGINE MergeTree()
ORDER BY haystack; ORDER BY haystack;
INSERT INTO ht VALUES {values};""".format(values=", ".join(rows))) INSERT INTO ht VALUES {values};""".format(
values=", ".join(rows)
)
)
for q in sorted(table_queries): for q in sorted(table_queries):
query(q) query(q)
query("""DROP TABLE ht""") query("""DROP TABLE ht""")
if __name__ == '__main__':
if __name__ == "__main__":
main() main()

View File

@@ -8,28 +8,32 @@ import sys
import signal import signal
CLICKHOUSE_CLIENT = os.environ.get('CLICKHOUSE_CLIENT') CLICKHOUSE_CLIENT = os.environ.get("CLICKHOUSE_CLIENT")
CLICKHOUSE_CURL = os.environ.get('CLICKHOUSE_CURL') CLICKHOUSE_CURL = os.environ.get("CLICKHOUSE_CURL")
CLICKHOUSE_URL = os.environ.get('CLICKHOUSE_URL') CLICKHOUSE_URL = os.environ.get("CLICKHOUSE_URL")
def send_query(query): def send_query(query):
cmd = list(CLICKHOUSE_CLIENT.split()) cmd = list(CLICKHOUSE_CLIENT.split())
cmd += ['--query', query] cmd += ["--query", query]
# print(cmd) # print(cmd)
return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout return subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
).stdout
def send_query_in_process_group(query): def send_query_in_process_group(query):
cmd = list(CLICKHOUSE_CLIENT.split()) cmd = list(CLICKHOUSE_CLIENT.split())
cmd += ['--query', query] cmd += ["--query", query]
# print(cmd) # print(cmd)
return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=os.setsid) return subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, preexec_fn=os.setsid
)
def read_lines_and_push_to_queue(pipe, queue): def read_lines_and_push_to_queue(pipe, queue):
try: try:
for line in iter(pipe.readline, ''): for line in iter(pipe.readline, ""):
line = line.strip() line = line.strip()
print(line) print(line)
sys.stdout.flush() sys.stdout.flush()
@@ -41,41 +45,44 @@ def dowload_with_progress(url, path):
def test(): def test():
send_query('DROP TABLE IF EXISTS test.lv').read() send_query("DROP TABLE IF EXISTS test.lv").read()
send_query('DROP TABLE IF EXISTS test.mt').read() send_query("DROP TABLE IF EXISTS test.mt").read()
send_query('CREATE TABLE test.mt (a Int32) Engine=MergeTree order by tuple()').read() send_query(
send_query('CREATE LIVE VIEW test.lv AS SELECT sum(a) FROM test.mt').read() "CREATE TABLE test.mt (a Int32) Engine=MergeTree order by tuple()"
).read()
send_query("CREATE LIVE VIEW test.lv AS SELECT sum(a) FROM test.mt").read()
q = queue.Queue() q = queue.Queue()
p = send_query_in_process_group('WATCH test.lv') p = send_query_in_process_group("WATCH test.lv")
thread = threading.Thread(target=read_lines_and_push_to_queue, args=(p.stdout, q)) thread = threading.Thread(target=read_lines_and_push_to_queue, args=(p.stdout, q))
thread.start() thread.start()
line = q.get() line = q.get()
print(line) print(line)
assert (line == '0\t1') assert line == "0\t1"
send_query('INSERT INTO test.mt VALUES (1),(2),(3)').read() send_query("INSERT INTO test.mt VALUES (1),(2),(3)").read()
line = q.get() line = q.get()
print(line) print(line)
assert (line == '6\t2') assert line == "6\t2"
send_query('INSERT INTO test.mt VALUES (4),(5),(6)').read() send_query("INSERT INTO test.mt VALUES (4),(5),(6)").read()
line = q.get() line = q.get()
print(line) print(line)
assert (line == '21\t3') assert line == "21\t3"
# Send Ctrl+C to client. # Send Ctrl+C to client.
os.killpg(os.getpgid(p.pid), signal.SIGINT) os.killpg(os.getpgid(p.pid), signal.SIGINT)
# This insert shouldn't affect lv. # This insert shouldn't affect lv.
send_query('INSERT INTO test.mt VALUES (7),(8),(9)').read() send_query("INSERT INTO test.mt VALUES (7),(8),(9)").read()
line = q.get() line = q.get()
print(line) print(line)
assert (line is None) assert line is None
send_query('DROP TABLE if exists test.lv').read() send_query("DROP TABLE if exists test.lv").read()
send_query('DROP TABLE if exists test.lv').read() send_query("DROP TABLE if exists test.lv").read()
thread.join() thread.join()
test() test()

View File

@@ -7,26 +7,30 @@ import os
import sys import sys
CLICKHOUSE_CLIENT = os.environ.get('CLICKHOUSE_CLIENT') CLICKHOUSE_CLIENT = os.environ.get("CLICKHOUSE_CLIENT")
CLICKHOUSE_CURL = os.environ.get('CLICKHOUSE_CURL') CLICKHOUSE_CURL = os.environ.get("CLICKHOUSE_CURL")
CLICKHOUSE_URL = os.environ.get('CLICKHOUSE_URL') CLICKHOUSE_URL = os.environ.get("CLICKHOUSE_URL")
def send_query(query): def send_query(query):
cmd = list(CLICKHOUSE_CLIENT.split()) cmd = list(CLICKHOUSE_CLIENT.split())
cmd += ['--query', query] cmd += ["--query", query]
# print(cmd) # print(cmd)
return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout return subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
).stdout
def send_http_query(query): def send_http_query(query):
cmd = list(CLICKHOUSE_CURL.split()) # list(['curl', '-sSN', '--max-time', '10']) cmd = list(CLICKHOUSE_CURL.split()) # list(['curl', '-sSN', '--max-time', '10'])
cmd += ['-sSN', CLICKHOUSE_URL, '-d', query] cmd += ["-sSN", CLICKHOUSE_URL, "-d", query]
return subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT).stdout return subprocess.Popen(
cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
).stdout
def read_lines_and_push_to_queue(pipe, queue): def read_lines_and_push_to_queue(pipe, queue):
for line in iter(pipe.readline, ''): for line in iter(pipe.readline, ""):
line = line.strip() line = line.strip()
print(line) print(line)
sys.stdout.flush() sys.stdout.flush()
@@ -36,28 +40,31 @@ def read_lines_and_push_to_queue(pipe, queue):
def test(): def test():
send_query('DROP TABLE IF EXISTS test.lv').read() send_query("DROP TABLE IF EXISTS test.lv").read()
send_query('DROP TABLE IF EXISTS test.mt').read() send_query("DROP TABLE IF EXISTS test.mt").read()
send_query('CREATE TABLE test.mt (a Int32) Engine=MergeTree order by tuple()').read() send_query(
send_query('CREATE LIVE VIEW test.lv AS SELECT sum(a) FROM test.mt').read() "CREATE TABLE test.mt (a Int32) Engine=MergeTree order by tuple()"
).read()
send_query("CREATE LIVE VIEW test.lv AS SELECT sum(a) FROM test.mt").read()
q = queue.Queue() q = queue.Queue()
pipe = send_http_query('WATCH test.lv') pipe = send_http_query("WATCH test.lv")
thread = threading.Thread(target=read_lines_and_push_to_queue, args=(pipe, q)) thread = threading.Thread(target=read_lines_and_push_to_queue, args=(pipe, q))
thread.start() thread.start()
line = q.get() line = q.get()
print(line) print(line)
assert (line == '0\t1') assert line == "0\t1"
send_query('INSERT INTO test.mt VALUES (1),(2),(3)').read() send_query("INSERT INTO test.mt VALUES (1),(2),(3)").read()
line = q.get() line = q.get()
print(line) print(line)
assert (line == '6\t2') assert line == "6\t2"
send_query('DROP TABLE if exists test.lv').read() send_query("DROP TABLE if exists test.lv").read()
send_query('DROP TABLE if exists test.lv').read() send_query("DROP TABLE if exists test.lv").read()
thread.join() thread.join()
test() test()

View File

@@ -1,4 +1,4 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
import sys import sys
from scipy import stats from scipy import stats
@@ -6,70 +6,86 @@ import pandas as pd
 import numpy as np

 CURDIR = os.path.dirname(os.path.realpath(__file__))
-sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
+sys.path.insert(0, os.path.join(CURDIR, "helpers"))

 from pure_http_client import ClickHouseClient

 def test_and_check(name, a, b, t_stat, p_value, precision=1e-2):
     client = ClickHouseClient()
     client.query("DROP TABLE IF EXISTS ttest;")
-    client.query("CREATE TABLE ttest (left Float64, right UInt8) ENGINE = Memory;");
-    client.query("INSERT INTO ttest VALUES {};".format(", ".join(['({},{})'.format(i, 0) for i in a])))
-    client.query("INSERT INTO ttest VALUES {};".format(", ".join(['({},{})'.format(j, 1) for j in b])))
+    client.query("CREATE TABLE ttest (left Float64, right UInt8) ENGINE = Memory;")
+    client.query(
+        "INSERT INTO ttest VALUES {};".format(
+            ", ".join(["({},{})".format(i, 0) for i in a])
+        )
+    )
+    client.query(
+        "INSERT INTO ttest VALUES {};".format(
+            ", ".join(["({},{})".format(j, 1) for j in b])
+        )
+    )
     real = client.query_return_df(
-        "SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name) +
-        "roundBankers({}(left, right).2, 16) as p_value ".format(name) +
-        "FROM ttest FORMAT TabSeparatedWithNames;")
-    real_t_stat = real['t_stat'][0]
-    real_p_value = real['p_value'][0]
-    assert(abs(real_t_stat - np.float64(t_stat)) < precision), "clickhouse_t_stat {}, scipy_t_stat {}".format(real_t_stat, t_stat)
-    assert(abs(real_p_value - np.float64(p_value)) < precision), "clickhouse_p_value {}, scipy_p_value {}".format(real_p_value, p_value)
+        "SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name)
+        + "roundBankers({}(left, right).2, 16) as p_value ".format(name)
+        + "FROM ttest FORMAT TabSeparatedWithNames;"
+    )
+    real_t_stat = real["t_stat"][0]
+    real_p_value = real["p_value"][0]
+    assert (
+        abs(real_t_stat - np.float64(t_stat)) < precision
+    ), "clickhouse_t_stat {}, scipy_t_stat {}".format(real_t_stat, t_stat)
+    assert (
+        abs(real_p_value - np.float64(p_value)) < precision
+    ), "clickhouse_p_value {}, scipy_p_value {}".format(real_p_value, p_value)
     client.query("DROP TABLE IF EXISTS ttest;")
def test_student(): def test_student():
rvs1 = np.round(stats.norm.rvs(loc=1, scale=5,size=500), 2) rvs1 = np.round(stats.norm.rvs(loc=1, scale=5, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=10, scale=5,size=500), 2) rvs2 = np.round(stats.norm.rvs(loc=10, scale=5, size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = True) s, p = stats.ttest_ind(rvs1, rvs2, equal_var=True)
test_and_check("studentTTest", rvs1, rvs2, s, p) test_and_check("studentTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=5,size=500), 2) rvs1 = np.round(stats.norm.rvs(loc=0, scale=5, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=5,size=500), 2) rvs2 = np.round(stats.norm.rvs(loc=0, scale=5, size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = True) s, p = stats.ttest_ind(rvs1, rvs2, equal_var=True)
test_and_check("studentTTest", rvs1, rvs2, s, p) test_and_check("studentTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=2, scale=10,size=512), 2) rvs1 = np.round(stats.norm.rvs(loc=2, scale=10, size=512), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=20,size=1024), 2) rvs2 = np.round(stats.norm.rvs(loc=5, scale=20, size=1024), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = True) s, p = stats.ttest_ind(rvs1, rvs2, equal_var=True)
test_and_check("studentTTest", rvs1, rvs2, s, p) test_and_check("studentTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=10,size=1024), 2) rvs1 = np.round(stats.norm.rvs(loc=0, scale=10, size=1024), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=10,size=512), 2) rvs2 = np.round(stats.norm.rvs(loc=0, scale=10, size=512), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = True) s, p = stats.ttest_ind(rvs1, rvs2, equal_var=True)
test_and_check("studentTTest", rvs1, rvs2, s, p) test_and_check("studentTTest", rvs1, rvs2, s, p)
def test_welch(): def test_welch():
rvs1 = np.round(stats.norm.rvs(loc=1, scale=15,size=500), 2) rvs1 = np.round(stats.norm.rvs(loc=1, scale=15, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=10, scale=5,size=500), 2) rvs2 = np.round(stats.norm.rvs(loc=10, scale=5, size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False) s, p = stats.ttest_ind(rvs1, rvs2, equal_var=False)
test_and_check("welchTTest", rvs1, rvs2, s, p) test_and_check("welchTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=7,size=500), 2) rvs1 = np.round(stats.norm.rvs(loc=0, scale=7, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=3,size=500), 2) rvs2 = np.round(stats.norm.rvs(loc=0, scale=3, size=500), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False) s, p = stats.ttest_ind(rvs1, rvs2, equal_var=False)
test_and_check("welchTTest", rvs1, rvs2, s, p) test_and_check("welchTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=10,size=1024), 2) rvs1 = np.round(stats.norm.rvs(loc=0, scale=10, size=1024), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=1,size=512), 2) rvs2 = np.round(stats.norm.rvs(loc=5, scale=1, size=512), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False) s, p = stats.ttest_ind(rvs1, rvs2, equal_var=False)
test_and_check("welchTTest", rvs1, rvs2, s, p) test_and_check("welchTTest", rvs1, rvs2, s, p)
rvs1 = np.round(stats.norm.rvs(loc=5, scale=10,size=512), 2) rvs1 = np.round(stats.norm.rvs(loc=5, scale=10, size=512), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=10,size=1024), 2) rvs2 = np.round(stats.norm.rvs(loc=5, scale=10, size=1024), 2)
s, p = stats.ttest_ind(rvs1, rvs2, equal_var = False) s, p = stats.ttest_ind(rvs1, rvs2, equal_var=False)
test_and_check("welchTTest", rvs1, rvs2, s, p) test_and_check("welchTTest", rvs1, rvs2, s, p)
if __name__ == "__main__": if __name__ == "__main__":
test_student() test_student()
test_welch() test_welch()
print("Ok.") print("Ok.")
View File
@ -6,7 +6,7 @@ import pandas as pd
import numpy as np import numpy as np
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
@ -14,40 +14,51 @@ from pure_http_client import ClickHouseClient
def test_and_check(name, a, b, t_stat, p_value): def test_and_check(name, a, b, t_stat, p_value):
client = ClickHouseClient() client = ClickHouseClient()
client.query("DROP TABLE IF EXISTS mann_whitney;") client.query("DROP TABLE IF EXISTS mann_whitney;")
client.query("CREATE TABLE mann_whitney (left Float64, right UInt8) ENGINE = Memory;"); client.query(
client.query("INSERT INTO mann_whitney VALUES {};".format(", ".join(['({},{}), ({},{})'.format(i, 0, j, 1) for i,j in zip(a, b)]))) "CREATE TABLE mann_whitney (left Float64, right UInt8) ENGINE = Memory;"
)
client.query(
"INSERT INTO mann_whitney VALUES {};".format(
", ".join(["({},{}), ({},{})".format(i, 0, j, 1) for i, j in zip(a, b)])
)
)
real = client.query_return_df( real = client.query_return_df(
"SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name) + "SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name)
"roundBankers({}(left, right).2, 16) as p_value ".format(name) + + "roundBankers({}(left, right).2, 16) as p_value ".format(name)
"FROM mann_whitney FORMAT TabSeparatedWithNames;") + "FROM mann_whitney FORMAT TabSeparatedWithNames;"
real_t_stat = real['t_stat'][0] )
real_p_value = real['p_value'][0] real_t_stat = real["t_stat"][0]
assert(abs(real_t_stat - np.float64(t_stat)) < 1e-2), "clickhouse_t_stat {}, scipy_t_stat {}".format(real_t_stat, t_stat) real_p_value = real["p_value"][0]
assert(abs(real_p_value - np.float64(p_value)) < 1e-2), "clickhouse_p_value {}, scipy_p_value {}".format(real_p_value, p_value) assert (
abs(real_t_stat - np.float64(t_stat)) < 1e-2
), "clickhouse_t_stat {}, scipy_t_stat {}".format(real_t_stat, t_stat)
assert (
abs(real_p_value - np.float64(p_value)) < 1e-2
), "clickhouse_p_value {}, scipy_p_value {}".format(real_p_value, p_value)
client.query("DROP TABLE IF EXISTS mann_whitney;") client.query("DROP TABLE IF EXISTS mann_whitney;")
def test_mann_whitney(): def test_mann_whitney():
rvs1 = np.round(stats.norm.rvs(loc=1, scale=5,size=500), 5) rvs1 = np.round(stats.norm.rvs(loc=1, scale=5, size=500), 5)
rvs2 = np.round(stats.expon.rvs(scale=0.2,size=500), 5) rvs2 = np.round(stats.expon.rvs(scale=0.2, size=500), 5)
s, p = stats.mannwhitneyu(rvs1, rvs2, alternative='two-sided') s, p = stats.mannwhitneyu(rvs1, rvs2, alternative="two-sided")
test_and_check("mannWhitneyUTest", rvs1, rvs2, s, p) test_and_check("mannWhitneyUTest", rvs1, rvs2, s, p)
test_and_check("mannWhitneyUTest('two-sided')", rvs1, rvs2, s, p) test_and_check("mannWhitneyUTest('two-sided')", rvs1, rvs2, s, p)
equal = np.round(stats.cauchy.rvs(scale=5, size=500), 5) equal = np.round(stats.cauchy.rvs(scale=5, size=500), 5)
s, p = stats.mannwhitneyu(equal, equal, alternative='two-sided') s, p = stats.mannwhitneyu(equal, equal, alternative="two-sided")
test_and_check("mannWhitneyUTest('two-sided')", equal, equal, s, p) test_and_check("mannWhitneyUTest('two-sided')", equal, equal, s, p)
s, p = stats.mannwhitneyu(equal, equal, alternative='less', use_continuity=False) s, p = stats.mannwhitneyu(equal, equal, alternative="less", use_continuity=False)
test_and_check("mannWhitneyUTest('less', 0)", equal, equal, s, p) test_and_check("mannWhitneyUTest('less', 0)", equal, equal, s, p)
rvs1 = np.round(stats.cauchy.rvs(scale=10, size=65536), 5)
rvs1 = np.round(stats.cauchy.rvs(scale=10,size=65536), 5) rvs2 = np.round(stats.norm.rvs(loc=0, scale=10, size=65536), 5)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=10,size=65536), 5) s, p = stats.mannwhitneyu(rvs1, rvs2, alternative="greater")
s, p = stats.mannwhitneyu(rvs1, rvs2, alternative='greater')
test_and_check("mannWhitneyUTest('greater')", rvs1, rvs2, s, p) test_and_check("mannWhitneyUTest('greater')", rvs1, rvs2, s, p)
if __name__ == "__main__": if __name__ == "__main__":
test_mann_whitney() test_mann_whitney()
print("Ok.") print("Ok.")
View File
@ -4,14 +4,18 @@ from random import randint, choices
import sys import sys
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
client = ClickHouseClient() client = ClickHouseClient()
N = 10 N = 10
create_query = "CREATE TABLE t_cnf_fuzz(" + ", ".join([f"c{i} UInt8" for i in range(N)]) + ") ENGINE = Memory" create_query = (
"CREATE TABLE t_cnf_fuzz("
+ ", ".join([f"c{i} UInt8" for i in range(N)])
+ ") ENGINE = Memory"
)
client.query("DROP TABLE IF EXISTS t_cnf_fuzz") client.query("DROP TABLE IF EXISTS t_cnf_fuzz")
client.query(create_query) client.query(create_query)
@ -35,6 +39,7 @@ client.query(insert_query)
MAX_CLAUSES = 10 MAX_CLAUSES = 10
MAX_ATOMS = 5 MAX_ATOMS = 5
def generate_dnf(): def generate_dnf():
clauses = [] clauses = []
num_clauses = randint(1, MAX_CLAUSES) num_clauses = randint(1, MAX_CLAUSES)
@ -42,12 +47,17 @@ def generate_dnf():
num_atoms = randint(1, MAX_ATOMS) num_atoms = randint(1, MAX_ATOMS)
atom_ids = choices(range(N), k=num_atoms) atom_ids = choices(range(N), k=num_atoms)
negates = choices([0, 1], k=num_atoms) negates = choices([0, 1], k=num_atoms)
atoms = [f"(NOT c{i})" if neg else f"c{i}" for (i, neg) in zip(atom_ids, negates)] atoms = [
f"(NOT c{i})" if neg else f"c{i}" for (i, neg) in zip(atom_ids, negates)
]
clauses.append("(" + " AND ".join(atoms) + ")") clauses.append("(" + " AND ".join(atoms) + ")")
return " OR ".join(clauses) return " OR ".join(clauses)
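For orientation, a hedged illustration of the predicates generate_dnf() can emit; the concrete atoms are drawn at random, so the string below only shows the shape and is not produced by any fixed seed:

# Illustrative only: one possible draw with N = 10, MAX_CLAUSES = 10, MAX_ATOMS = 5.
example_predicate = "(c3 AND (NOT c7) AND c0) OR ((NOT c1)) OR (c4 AND c4)"
# The fuzzer substitutes such a predicate into select_query below and, presumably,
# expects the same count() with convert_query_to_cnf = 0 and = 1.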
select_query = "SELECT count() FROM t_cnf_fuzz WHERE {} SETTINGS convert_query_to_cnf = {}"
select_query = (
"SELECT count() FROM t_cnf_fuzz WHERE {} SETTINGS convert_query_to_cnf = {}"
)
fail_report = """ fail_report = """
Failed query: '{}'. Failed query: '{}'.
View File
@ -5,15 +5,20 @@ import random
import string import string
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
def get_random_string(length): def get_random_string(length):
return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(length)) return "".join(
random.choice(string.ascii_uppercase + string.digits) for _ in range(length)
)
client = ClickHouseClient() client = ClickHouseClient()
def insert_block(table_name, block_granularity_rows, block_rows): def insert_block(table_name, block_granularity_rows, block_rows):
global client global client
block_data = [] block_data = []
@ -25,9 +30,12 @@ def insert_block(table_name, block_granularity_rows, block_rows):
values_row = ", ".join("(1, '" + row + "')" for row in block_data) values_row = ", ".join("(1, '" + row + "')" for row in block_data)
client.query("INSERT INTO {} VALUES {}".format(table_name, values_row)) client.query("INSERT INTO {} VALUES {}".format(table_name, values_row))
try: try:
client.query("DROP TABLE IF EXISTS t") client.query("DROP TABLE IF EXISTS t")
client.query("CREATE TABLE t (v UInt8, data String) ENGINE = MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0") client.query(
"CREATE TABLE t (v UInt8, data String) ENGINE = MergeTree() ORDER BY tuple() SETTINGS min_bytes_for_wide_part = 0"
)
client.query("SYSTEM STOP MERGES t") client.query("SYSTEM STOP MERGES t")
@ -53,6 +61,10 @@ try:
client.query("SYSTEM START MERGES t") client.query("SYSTEM START MERGES t")
client.query("OPTIMIZE TABLE t FINAL") client.query("OPTIMIZE TABLE t FINAL")
print(client.query_return_df("SELECT COUNT() as C FROM t FORMAT TabSeparatedWithNames")['C'][0]) print(
client.query_return_df(
"SELECT COUNT() as C FROM t FORMAT TabSeparatedWithNames"
)["C"][0]
)
finally: finally:
client.query("DROP TABLE IF EXISTS t") client.query("DROP TABLE IF EXISTS t")
View File
@ -1,6 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from http.server import SimpleHTTPRequestHandler,HTTPServer from http.server import SimpleHTTPRequestHandler, HTTPServer
import socket import socket
import csv import csv
import sys import sys
@ -21,6 +21,7 @@ def is_ipv6(host):
except: except:
return True return True
def get_local_port(host, ipv6): def get_local_port(host, ipv6):
if ipv6: if ipv6:
family = socket.AF_INET6 family = socket.AF_INET6
@ -31,8 +32,9 @@ def get_local_port(host, ipv6):
fd.bind((host, 0)) fd.bind((host, 0))
return fd.getsockname()[1] return fd.getsockname()[1]
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', 'localhost')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "localhost")
CLICKHOUSE_PORT_HTTP = os.environ.get("CLICKHOUSE_PORT_HTTP", "8123")
##################################################################################### #####################################################################################
# This test starts an HTTP server and serves data to a clickhouse url-engine based table. # This test starts an HTTP server and serves data to a clickhouse url-engine based table.
@ -42,16 +44,24 @@ CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
##################################################################################### #####################################################################################
# IP address of this host that is accessible from the outside world. Take the first one. # IP address of this host that is accessible from the outside world. Take the first one.
HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0] HTTP_SERVER_HOST = (
subprocess.check_output(["hostname", "-i"]).decode("utf-8").strip().split()[0]
)
IS_IPV6 = is_ipv6(HTTP_SERVER_HOST) IS_IPV6 = is_ipv6(HTTP_SERVER_HOST)
HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6) HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6)
# IP address and port of the HTTP server started from this script. # IP address and port of the HTTP server started from this script.
HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT) HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT)
if IS_IPV6: if IS_IPV6:
HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/" HTTP_SERVER_URL_STR = (
"http://"
+ f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/"
)
else: else:
HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/" HTTP_SERVER_URL_STR = (
"http://" + f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}" + "/"
)
# Because we need to check the content of file.csv, we create that content here and avoid reading the csv from disk # Because we need to check the content of file.csv, we create that content here and avoid reading the csv from disk
CSV_DATA = "Hello, 1\nWorld, 2\nThis, 152\nis, 9283\ntesting, 2313213\ndata, 555\n" CSV_DATA = "Hello, 1\nWorld, 2\nThis, 152\nis, 9283\ntesting, 2313213\ndata, 555\n"
@ -59,19 +69,24 @@ CSV_DATA = "Hello, 1\nWorld, 2\nThis, 152\nis, 9283\ntesting, 2313213\ndata, 555
# Choose compression method # Choose compression method
# (Will change during the test; we also need to check standard data sending to make sure that nothing broke) # (Will change during the test; we also need to check standard data sending to make sure that nothing broke)
COMPRESS_METHOD = 'none' COMPRESS_METHOD = "none"
ADDING_ENDING = '' ADDING_ENDING = ""
ENDINGS = ['.gz', '.xz'] ENDINGS = [".gz", ".xz"]
SEND_ENCODING = True SEND_ENCODING = True
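A minimal sketch of the payloads compress_data() below produces for the methods this test cycles through; gzip and lzma are the standard-library modules, and the variable names here are illustrative rather than part of the test:

import gzip
import lzma

payload_plain = CSV_DATA.encode()             # CSV_DATA is defined above
payload_gzip = gzip.compress(payload_plain)   # served with "Content-Encoding: gzip"
payload_lzma = lzma.compress(payload_plain)   # served with "Content-Encoding: lzma"
# When SEND_ENCODING is False, the method is implied by the URL suffix instead
# ('.gz' or '.xz'), which is what ADDING_ENDING and ENDINGS exercise.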
def get_ch_answer(query): def get_ch_answer(query):
host = CLICKHOUSE_HOST host = CLICKHOUSE_HOST
if IS_IPV6: if IS_IPV6:
host = f'[{host}]' host = f"[{host}]"
url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP)) url = os.environ.get(
"CLICKHOUSE_URL",
"http://{host}:{port}".format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP),
)
return urllib.request.urlopen(url, data=query.encode()).read().decode() return urllib.request.urlopen(url, data=query.encode()).read().decode()
def check_answers(query, answer): def check_answers(query, answer):
ch_answer = get_ch_answer(query) ch_answer = get_ch_answer(query)
if ch_answer.strip() != answer.strip(): if ch_answer.strip() != answer.strip():
@ -80,18 +95,19 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer, file=sys.stderr) print("Fetched answer :", ch_answer, file=sys.stderr)
raise Exception("Fail on query") raise Exception("Fail on query")
# Server with HEAD method, which is useful for debugging by hand # Server with HEAD method, which is useful for debugging by hand
class HttpProcessor(SimpleHTTPRequestHandler): class HttpProcessor(SimpleHTTPRequestHandler):
def _set_headers(self): def _set_headers(self):
self.send_response(200) self.send_response(200)
if SEND_ENCODING: if SEND_ENCODING:
self.send_header('Content-Encoding', COMPRESS_METHOD) self.send_header("Content-Encoding", COMPRESS_METHOD)
if COMPRESS_METHOD == 'none': if COMPRESS_METHOD == "none":
self.send_header('Content-Length', len(CSV_DATA.encode())) self.send_header("Content-Length", len(CSV_DATA.encode()))
else: else:
self.compress_data() self.compress_data()
self.send_header('Content-Length', len(self.data)) self.send_header("Content-Length", len(self.data))
self.send_header('Content-Type', 'text/csv') self.send_header("Content-Type", "text/csv")
self.end_headers() self.end_headers()
def do_HEAD(self): def do_HEAD(self):
@ -99,18 +115,17 @@ class HttpProcessor(SimpleHTTPRequestHandler):
return return
def compress_data(self): def compress_data(self):
if COMPRESS_METHOD == 'gzip': if COMPRESS_METHOD == "gzip":
self.data = gzip.compress((CSV_DATA).encode()) self.data = gzip.compress((CSV_DATA).encode())
elif COMPRESS_METHOD == 'lzma': elif COMPRESS_METHOD == "lzma":
self.data = lzma.compress((CSV_DATA).encode()) self.data = lzma.compress((CSV_DATA).encode())
else: else:
self.data = 'WRONG CONVERSATION'.encode() self.data = "WRONG CONVERSATION".encode()
def do_GET(self): def do_GET(self):
self._set_headers() self._set_headers()
if COMPRESS_METHOD == 'none': if COMPRESS_METHOD == "none":
self.wfile.write(CSV_DATA.encode()) self.wfile.write(CSV_DATA.encode())
else: else:
self.wfile.write(self.data) self.wfile.write(self.data)
@ -119,9 +134,11 @@ class HttpProcessor(SimpleHTTPRequestHandler):
def log_message(self, format, *args): def log_message(self, format, *args):
return return
class HTTPServerV6(HTTPServer): class HTTPServerV6(HTTPServer):
address_family = socket.AF_INET6 address_family = socket.AF_INET6
def start_server(requests_amount): def start_server(requests_amount):
if IS_IPV6: if IS_IPV6:
httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, HttpProcessor) httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, HttpProcessor)
@ -135,52 +152,60 @@ def start_server(requests_amount):
t = threading.Thread(target=real_func) t = threading.Thread(target=real_func)
return t return t
##################################################################### #####################################################################
# Testing area. # Testing area.
##################################################################### #####################################################################
def test_select(dict_name="", schema="word String, counter UInt32", requests=[], answers=[], test_data=""):
def test_select(
dict_name="",
schema="word String, counter UInt32",
requests=[],
answers=[],
test_data="",
):
global ADDING_ENDING global ADDING_ENDING
global SEND_ENCODING global SEND_ENCODING
global COMPRESS_METHOD global COMPRESS_METHOD
for i in range(len(requests)): for i in range(len(requests)):
if i > 2: if i > 2:
ADDING_ENDING = ENDINGS[i-3] ADDING_ENDING = ENDINGS[i - 3]
SEND_ENCODING = False SEND_ENCODING = False
if dict_name: if dict_name:
get_ch_answer("drop dictionary if exists {}".format(dict_name)) get_ch_answer("drop dictionary if exists {}".format(dict_name))
get_ch_answer('''CREATE DICTIONARY {} ({}) get_ch_answer(
"""CREATE DICTIONARY {} ({})
PRIMARY KEY word PRIMARY KEY word
SOURCE(HTTP(url '{}' format 'CSV')) SOURCE(HTTP(url '{}' format 'CSV'))
LAYOUT(complex_key_hashed()) LAYOUT(complex_key_hashed())
LIFETIME(0)'''.format(dict_name, schema, HTTP_SERVER_URL_STR + '/test.csv' + ADDING_ENDING)) LIFETIME(0)""".format(
dict_name, schema, HTTP_SERVER_URL_STR + "/test.csv" + ADDING_ENDING
)
)
COMPRESS_METHOD = requests[i] COMPRESS_METHOD = requests[i]
print(i, COMPRESS_METHOD, ADDING_ENDING, SEND_ENCODING) print(i, COMPRESS_METHOD, ADDING_ENDING, SEND_ENCODING)
check_answers("SELECT * FROM {} ORDER BY word".format(dict_name), answers[i]) check_answers("SELECT * FROM {} ORDER BY word".format(dict_name), answers[i])
def main(): def main():
# first three for encoding, last two for url # first three for encoding, last two for url
insert_requests = [ insert_requests = ["none", "gzip", "lzma", "gzip", "lzma"]
'none',
'gzip',
'lzma',
'gzip',
'lzma'
]
# These answers were obtained experimentally in non-compressed mode and they are correct # These answers were obtained experimentally in non-compressed mode and they are correct
answers = ['''Hello 1\nThis 152\nWorld 2\ndata 555\nis 9283\ntesting 2313213'''] * 5 answers = ["""Hello 1\nThis 152\nWorld 2\ndata 555\nis 9283\ntesting 2313213"""] * 5
t = start_server(len(insert_requests)) t = start_server(len(insert_requests))
t.start() t.start()
test_select(dict_name="test_table_select", requests=insert_requests, answers=answers) test_select(
dict_name="test_table_select", requests=insert_requests, answers=answers
)
t.join() t.join()
print("PASSED") print("PASSED")
if __name__ == "__main__": if __name__ == "__main__":
try: try:
main() main()
@ -191,5 +216,3 @@ if __name__ == "__main__":
sys.stderr.flush() sys.stderr.flush()
os._exit(1) os._exit(1)
View File
@ -5,9 +5,10 @@ import socket
import os import os
import uuid import uuid
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1') CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "127.0.0.1")
CLICKHOUSE_PORT = int(os.environ.get('CLICKHOUSE_PORT_TCP', '900000')) CLICKHOUSE_PORT = int(os.environ.get("CLICKHOUSE_PORT_TCP", "900000"))
CLICKHOUSE_DATABASE = os.environ.get('CLICKHOUSE_DATABASE', 'default') CLICKHOUSE_DATABASE = os.environ.get("CLICKHOUSE_DATABASE", "default")
def writeVarUInt(x, ba): def writeVarUInt(x, ba):
for _ in range(0, 9): for _ in range(0, 9):
@ -24,12 +25,12 @@ def writeVarUInt(x, ba):
def writeStringBinary(s, ba): def writeStringBinary(s, ba):
b = bytes(s, 'utf-8') b = bytes(s, "utf-8")
writeVarUInt(len(s), ba) writeVarUInt(len(s), ba)
ba.extend(b) ba.extend(b)
def readStrict(s, size = 1): def readStrict(s, size=1):
res = bytearray() res = bytearray()
while size: while size:
cur = s.recv(size) cur = s.recv(size)
@ -48,18 +49,23 @@ def readUInt(s, size=1):
val += res[i] << (i * 8) val += res[i] << (i * 8)
return val return val
def readUInt8(s): def readUInt8(s):
return readUInt(s) return readUInt(s)
def readUInt16(s): def readUInt16(s):
return readUInt(s, 2) return readUInt(s, 2)
def readUInt32(s): def readUInt32(s):
return readUInt(s, 4) return readUInt(s, 4)
def readUInt64(s): def readUInt64(s):
return readUInt(s, 8) return readUInt(s, 8)
def readVarUInt(s): def readVarUInt(s):
x = 0 x = 0
for i in range(9): for i in range(9):
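writeVarUInt/readVarUInt implement the native protocol's variable-length integers. Below is a self-contained round-trip sketch of the same LEB128-style scheme (7 payload bits per byte, least-significant group first, high bit as continuation flag); it is written from that general description, so treat it as an assumption about the encoding rather than a copy of the script's helpers:

import io

def var_uint_encode(x):
    out = bytearray()
    for _ in range(9):          # at most 9 bytes for a 64-bit value
        byte = x & 0x7F
        x >>= 7
        if x:
            byte |= 0x80        # more bytes follow
        out.append(byte)
        if not x:
            break
    return bytes(out)

def var_uint_decode(stream):
    x = 0
    for i in range(9):
        byte = stream.read(1)[0]
        x |= (byte & 0x7F) << (7 * i)
        if not (byte & 0x80):   # continuation bit clear: last byte
            break
    return x

assert var_uint_decode(io.BytesIO(var_uint_encode(300))) == 300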
@ -75,25 +81,25 @@ def readVarUInt(s):
def readStringBinary(s): def readStringBinary(s):
size = readVarUInt(s) size = readVarUInt(s)
s = readStrict(s, size) s = readStrict(s, size)
return s.decode('utf-8') return s.decode("utf-8")
def sendHello(s): def sendHello(s):
ba = bytearray() ba = bytearray()
writeVarUInt(0, ba) # Hello writeVarUInt(0, ba) # Hello
writeStringBinary('simple native protocol', ba) writeStringBinary("simple native protocol", ba)
writeVarUInt(21, ba) writeVarUInt(21, ba)
writeVarUInt(9, ba) writeVarUInt(9, ba)
writeVarUInt(54449, ba) writeVarUInt(54449, ba)
writeStringBinary('default', ba) # database writeStringBinary("default", ba) # database
writeStringBinary('default', ba) # user writeStringBinary("default", ba) # user
writeStringBinary('', ba) # pwd writeStringBinary("", ba) # pwd
s.sendall(ba) s.sendall(ba)
def receiveHello(s): def receiveHello(s):
p_type = readVarUInt(s) p_type = readVarUInt(s)
assert (p_type == 0) # Hello assert p_type == 0 # Hello
server_name = readStringBinary(s) server_name = readStringBinary(s)
# print("Server name: ", server_name) # print("Server name: ", server_name)
server_version_major = readVarUInt(s) server_version_major = readVarUInt(s)
@ -111,78 +117,79 @@ def receiveHello(s):
def serializeClientInfo(ba, query_id): def serializeClientInfo(ba, query_id):
writeStringBinary('default', ba) # initial_user writeStringBinary("default", ba) # initial_user
writeStringBinary(query_id, ba) # initial_query_id writeStringBinary(query_id, ba) # initial_query_id
writeStringBinary('127.0.0.1:9000', ba) # initial_address writeStringBinary("127.0.0.1:9000", ba) # initial_address
ba.extend([0] * 8) # initial_query_start_time_microseconds ba.extend([0] * 8) # initial_query_start_time_microseconds
ba.append(1) # TCP ba.append(1) # TCP
writeStringBinary('os_user', ba) # os_user writeStringBinary("os_user", ba) # os_user
writeStringBinary('client_hostname', ba) # client_hostname writeStringBinary("client_hostname", ba) # client_hostname
writeStringBinary('client_name', ba) # client_name writeStringBinary("client_name", ba) # client_name
writeVarUInt(21, ba) writeVarUInt(21, ba)
writeVarUInt(9, ba) writeVarUInt(9, ba)
writeVarUInt(54449, ba) writeVarUInt(54449, ba)
writeStringBinary('', ba) # quota_key writeStringBinary("", ba) # quota_key
writeVarUInt(0, ba) # distributed_depth writeVarUInt(0, ba) # distributed_depth
writeVarUInt(1, ba) # client_version_patch writeVarUInt(1, ba) # client_version_patch
ba.append(0) # No telemetry ba.append(0) # No telemetry
def sendQuery(s, query): def sendQuery(s, query):
ba = bytearray() ba = bytearray()
query_id = uuid.uuid4().hex query_id = uuid.uuid4().hex
writeVarUInt(1, ba) # query writeVarUInt(1, ba) # query
writeStringBinary(query_id, ba) writeStringBinary(query_id, ba)
ba.append(1) # INITIAL_QUERY ba.append(1) # INITIAL_QUERY
# client info # client info
serializeClientInfo(ba, query_id) serializeClientInfo(ba, query_id)
writeStringBinary('', ba) # No settings writeStringBinary("", ba) # No settings
writeStringBinary('', ba) # No interserver secret writeStringBinary("", ba) # No interserver secret
writeVarUInt(2, ba) # Stage - Complete writeVarUInt(2, ba) # Stage - Complete
ba.append(0) # No compression ba.append(0) # No compression
writeStringBinary(query, ba) # query, finally writeStringBinary(query, ba) # query, finally
s.sendall(ba) s.sendall(ba)
def serializeBlockInfo(ba): def serializeBlockInfo(ba):
writeVarUInt(1, ba) # 1 writeVarUInt(1, ba) # 1
ba.append(0) # is_overflows ba.append(0) # is_overflows
writeVarUInt(2, ba) # 2 writeVarUInt(2, ba) # 2
writeVarUInt(0, ba) # 0 writeVarUInt(0, ba) # 0
ba.extend([0] * 4) # bucket_num ba.extend([0] * 4) # bucket_num
def sendEmptyBlock(s): def sendEmptyBlock(s):
ba = bytearray() ba = bytearray()
writeVarUInt(2, ba) # Data writeVarUInt(2, ba) # Data
writeStringBinary('', ba) writeStringBinary("", ba)
serializeBlockInfo(ba) serializeBlockInfo(ba)
writeVarUInt(0, ba) # rows writeVarUInt(0, ba) # rows
writeVarUInt(0, ba) # columns writeVarUInt(0, ba) # columns
s.sendall(ba) s.sendall(ba)
def assertPacket(packet, expected): def assertPacket(packet, expected):
assert(packet == expected), packet assert packet == expected, packet
def readHeader(s): def readHeader(s):
packet_type = readVarUInt(s) packet_type = readVarUInt(s)
if packet_type == 2: # Exception if packet_type == 2: # Exception
raise RuntimeError(readException(s)) raise RuntimeError(readException(s))
assertPacket(packet_type, 1) # Data assertPacket(packet_type, 1) # Data
readStringBinary(s) # external table name readStringBinary(s) # external table name
# BlockInfo # BlockInfo
assertPacket(readVarUInt(s), 1) # 1 assertPacket(readVarUInt(s), 1) # 1
assertPacket(readUInt8(s), 0) # is_overflows assertPacket(readUInt8(s), 0) # is_overflows
assertPacket(readVarUInt(s), 2) # 2 assertPacket(readVarUInt(s), 2) # 2
assertPacket(readUInt32(s), 4294967295) # bucket_num assertPacket(readUInt32(s), 4294967295) # bucket_num
assertPacket(readVarUInt(s), 0) # 0 assertPacket(readVarUInt(s), 0) # 0
columns = readVarUInt(s) # rows columns = readVarUInt(s) # rows
rows = readVarUInt(s) # columns rows = readVarUInt(s) # columns
print("Rows {} Columns {}".format(rows, columns)) print("Rows {} Columns {}".format(rows, columns))
for _ in range(columns): for _ in range(columns):
col_name = readStringBinary(s) col_name = readStringBinary(s)
@ -194,9 +201,9 @@ def readException(s):
code = readUInt32(s) code = readUInt32(s)
name = readStringBinary(s) name = readStringBinary(s)
text = readStringBinary(s) text = readStringBinary(s)
readStringBinary(s) # trace readStringBinary(s) # trace
assertPacket(readUInt8(s), 0) # has_nested assertPacket(readUInt8(s), 0) # has_nested
return "code {}: {}".format(code, text.replace('DB::Exception:', '')) return "code {}: {}".format(code, text.replace("DB::Exception:", ""))
def insertValidLowCardinalityRow(): def insertValidLowCardinalityRow():
@ -205,7 +212,12 @@ def insertValidLowCardinalityRow():
s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT)) s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT))
sendHello(s) sendHello(s)
receiveHello(s) receiveHello(s)
sendQuery(s, 'insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV'.format(CLICKHOUSE_DATABASE)) sendQuery(
s,
"insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV".format(
CLICKHOUSE_DATABASE
),
)
# external tables # external tables
sendEmptyBlock(s) sendEmptyBlock(s)
@ -213,25 +225,27 @@ def insertValidLowCardinalityRow():
# Data # Data
ba = bytearray() ba = bytearray()
writeVarUInt(2, ba) # Data writeVarUInt(2, ba) # Data
writeStringBinary('', ba) writeStringBinary("", ba)
serializeBlockInfo(ba) serializeBlockInfo(ba)
writeVarUInt(1, ba) # rows writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns writeVarUInt(1, ba) # columns
writeStringBinary('x', ba) writeStringBinary("x", ba)
writeStringBinary('LowCardinality(String)', ba) writeStringBinary("LowCardinality(String)", ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend([3, 2] + [0] * 6) # indexes type: UInt64 [3], with additional keys [2] ba.extend(
ba.extend([1] + [0] * 7) # num_keys in dict [3, 2] + [0] * 6
writeStringBinary('hello', ba) # key ) # indexes type: UInt64 [3], with additional keys [2]
ba.extend([1] + [0] * 7) # num_indexes ba.extend([1] + [0] * 7) # num_keys in dict
ba.extend([0] * 8) # UInt64 index (0 for 'hello') writeStringBinary("hello", ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 8) # UInt64 index (0 for 'hello')
s.sendall(ba) s.sendall(ba)
# Fin block # Fin block
sendEmptyBlock(s) sendEmptyBlock(s)
assertPacket(readVarUInt(s), 5) # End of stream assertPacket(readVarUInt(s), 5) # End of stream
s.close() s.close()
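The two flag "extend" calls in these functions each pack one little-endian UInt64: the first is the shared-dictionary serialization version, the second combines the index width with feature bits. A hedged decode of the values used in this file, assuming that layout (the constant names are descriptive, not ClickHouse identifiers):

UINT64_INDEXES = 3                 # low byte: index width (0 = UInt8 ... 3 = UInt64)
NEED_GLOBAL_DICTIONARY = 1 << 8    # assumed meaning of the "global dict" bit
HAS_ADDITIONAL_KEYS = 1 << 9       # dictionary keys are shipped inline with the block

valid_flags = UINT64_INDEXES | HAS_ADDITIONAL_KEYS                            # 0x0203
bad_flags = UINT64_INDEXES | NEED_GLOBAL_DICTIONARY | HAS_ADDITIONAL_KEYS     # 0x0303

assert list(valid_flags.to_bytes(8, "little"))[:2] == [3, 2]   # matches ba.extend([3, 2] + [0] * 6)
assert list(bad_flags.to_bytes(8, "little"))[:2] == [3, 3]     # matches ba.extend([3, 3] + [0] * 6)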
@ -241,7 +255,12 @@ def insertLowCardinalityRowWithIndexOverflow():
s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT)) s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT))
sendHello(s) sendHello(s)
receiveHello(s) receiveHello(s)
sendQuery(s, 'insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV'.format(CLICKHOUSE_DATABASE)) sendQuery(
s,
"insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV".format(
CLICKHOUSE_DATABASE
),
)
# external tables # external tables
sendEmptyBlock(s) sendEmptyBlock(s)
@ -249,19 +268,21 @@ def insertLowCardinalityRowWithIndexOverflow():
# Data # Data
ba = bytearray() ba = bytearray()
writeVarUInt(2, ba) # Data writeVarUInt(2, ba) # Data
writeStringBinary('', ba) writeStringBinary("", ba)
serializeBlockInfo(ba) serializeBlockInfo(ba)
writeVarUInt(1, ba) # rows writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns writeVarUInt(1, ba) # columns
writeStringBinary('x', ba) writeStringBinary("x", ba)
writeStringBinary('LowCardinality(String)', ba) writeStringBinary("LowCardinality(String)", ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend([3, 2] + [0] * 6) # indexes type: UInt64 [3], with additional keys [2] ba.extend(
ba.extend([1] + [0] * 7) # num_keys in dict [3, 2] + [0] * 6
writeStringBinary('hello', ba) # key ) # indexes type: UInt64 [3], with additional keys [2]
ba.extend([1] + [0] * 7) # num_indexes ba.extend([1] + [0] * 7) # num_keys in dict
ba.extend([0] * 7 + [1]) # UInt64 index (overflow) writeStringBinary("hello", ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 7 + [1]) # UInt64 index (overflow)
s.sendall(ba) s.sendall(ba)
assertPacket(readVarUInt(s), 2) assertPacket(readVarUInt(s), 2)
@ -275,7 +296,12 @@ def insertLowCardinalityRowWithIncorrectDictType():
s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT)) s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT))
sendHello(s) sendHello(s)
receiveHello(s) receiveHello(s)
sendQuery(s, 'insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV'.format(CLICKHOUSE_DATABASE)) sendQuery(
s,
"insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV".format(
CLICKHOUSE_DATABASE
),
)
# external tables # external tables
sendEmptyBlock(s) sendEmptyBlock(s)
@ -283,32 +309,40 @@ def insertLowCardinalityRowWithIncorrectDictType():
# Data # Data
ba = bytearray() ba = bytearray()
writeVarUInt(2, ba) # Data writeVarUInt(2, ba) # Data
writeStringBinary('', ba) writeStringBinary("", ba)
serializeBlockInfo(ba) serializeBlockInfo(ba)
writeVarUInt(1, ba) # rows writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns writeVarUInt(1, ba) # columns
writeStringBinary('x', ba) writeStringBinary("x", ba)
writeStringBinary('LowCardinality(String)', ba) writeStringBinary("LowCardinality(String)", ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend([3, 3] + [0] * 6) # indexes type: UInt64 [3], with global dict and add keys [1 + 2] ba.extend(
ba.extend([1] + [0] * 7) # num_keys in dict [3, 3] + [0] * 6
writeStringBinary('hello', ba) # key ) # indexes type: UInt64 [3], with global dict and add keys [1 + 2]
ba.extend([1] + [0] * 7) # num_indexes ba.extend([1] + [0] * 7) # num_keys in dict
ba.extend([0] * 8) # UInt64 index (0 for 'hello') writeStringBinary("hello", ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 8) # UInt64 index (0 for 'hello')
s.sendall(ba) s.sendall(ba)
assertPacket(readVarUInt(s), 2) assertPacket(readVarUInt(s), 2)
print(readException(s)) print(readException(s))
s.close() s.close()
def insertLowCardinalityRowWithIncorrectAdditionalKeys(): def insertLowCardinalityRowWithIncorrectAdditionalKeys():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(30) s.settimeout(30)
s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT)) s.connect((CLICKHOUSE_HOST, CLICKHOUSE_PORT))
sendHello(s) sendHello(s)
receiveHello(s) receiveHello(s)
sendQuery(s, 'insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV'.format(CLICKHOUSE_DATABASE)) sendQuery(
s,
"insert into {}.tab settings input_format_defaults_for_omitted_fields=0 format TSV".format(
CLICKHOUSE_DATABASE
),
)
# external tables # external tables
sendEmptyBlock(s) sendEmptyBlock(s)
@ -316,30 +350,34 @@ def insertLowCardinalityRowWithIncorrectAdditionalKeys():
# Data # Data
ba = bytearray() ba = bytearray()
writeVarUInt(2, ba) # Data writeVarUInt(2, ba) # Data
writeStringBinary('', ba) writeStringBinary("", ba)
serializeBlockInfo(ba) serializeBlockInfo(ba)
writeVarUInt(1, ba) # rows writeVarUInt(1, ba) # rows
writeVarUInt(1, ba) # columns writeVarUInt(1, ba) # columns
writeStringBinary('x', ba) writeStringBinary("x", ba)
writeStringBinary('LowCardinality(String)', ba) writeStringBinary("LowCardinality(String)", ba)
ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys ba.extend([1] + [0] * 7) # SharedDictionariesWithAdditionalKeys
ba.extend([3, 0] + [0] * 6) # indexes type: UInt64 [3], with NO additional keys [0] ba.extend(
ba.extend([1] + [0] * 7) # num_keys in dict [3, 0] + [0] * 6
writeStringBinary('hello', ba) # key ) # indexes type: UInt64 [3], with NO additional keys [0]
ba.extend([1] + [0] * 7) # num_indexes ba.extend([1] + [0] * 7) # num_keys in dict
ba.extend([0] * 8) # UInt64 index (0 for 'hello') writeStringBinary("hello", ba) # key
ba.extend([1] + [0] * 7) # num_indexes
ba.extend([0] * 8) # UInt64 index (0 for 'hello')
s.sendall(ba) s.sendall(ba)
assertPacket(readVarUInt(s), 2) assertPacket(readVarUInt(s), 2)
print(readException(s)) print(readException(s))
s.close() s.close()
def main(): def main():
insertValidLowCardinalityRow() insertValidLowCardinalityRow()
insertLowCardinalityRowWithIndexOverflow() insertLowCardinalityRowWithIndexOverflow()
insertLowCardinalityRowWithIncorrectDictType() insertLowCardinalityRowWithIncorrectDictType()
insertLowCardinalityRowWithIncorrectAdditionalKeys() insertLowCardinalityRowWithIncorrectAdditionalKeys()
if __name__ == "__main__": if __name__ == "__main__":
main() main()
View File
@ -12,6 +12,7 @@ import subprocess
from io import StringIO from io import StringIO
from http.server import BaseHTTPRequestHandler, HTTPServer from http.server import BaseHTTPRequestHandler, HTTPServer
def is_ipv6(host): def is_ipv6(host):
try: try:
socket.inet_aton(host) socket.inet_aton(host)
@ -19,6 +20,7 @@ def is_ipv6(host):
except: except:
return True return True
def get_local_port(host, ipv6): def get_local_port(host, ipv6):
if ipv6: if ipv6:
family = socket.AF_INET6 family = socket.AF_INET6
@ -29,8 +31,9 @@ def get_local_port(host, ipv6):
fd.bind((host, 0)) fd.bind((host, 0))
return fd.getsockname()[1] return fd.getsockname()[1]
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "127.0.0.1")
CLICKHOUSE_PORT_HTTP = os.environ.get("CLICKHOUSE_PORT_HTTP", "8123")
##################################################################################### #####################################################################################
# This test starts an HTTP server and serves data to a clickhouse url-engine based table. # This test starts an HTTP server and serves data to a clickhouse url-engine based table.
@ -39,27 +42,42 @@ CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123')
##################################################################################### #####################################################################################
# IP address of this host that is accessible from the outside world. Take the first one. # IP address of this host that is accessible from the outside world. Take the first one.
HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0] HTTP_SERVER_HOST = (
subprocess.check_output(["hostname", "-i"]).decode("utf-8").strip().split()[0]
)
IS_IPV6 = is_ipv6(HTTP_SERVER_HOST) IS_IPV6 = is_ipv6(HTTP_SERVER_HOST)
HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6) HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6)
# IP address and port of the HTTP server started from this script. # IP address and port of the HTTP server started from this script.
HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT) HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT)
if IS_IPV6: if IS_IPV6:
HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/" HTTP_SERVER_URL_STR = (
"http://"
+ f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/"
)
else: else:
HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/" HTTP_SERVER_URL_STR = (
"http://" + f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}" + "/"
)
CSV_DATA = os.path.join(
tempfile._get_default_tempdir(), next(tempfile._get_candidate_names())
)
CSV_DATA = os.path.join(tempfile._get_default_tempdir(), next(tempfile._get_candidate_names()))
def get_ch_answer(query): def get_ch_answer(query):
host = CLICKHOUSE_HOST host = CLICKHOUSE_HOST
if IS_IPV6: if IS_IPV6:
host = f'[{host}]' host = f"[{host}]"
url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP)) url = os.environ.get(
"CLICKHOUSE_URL",
"http://{host}:{port}".format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP),
)
return urllib.request.urlopen(url, data=query.encode()).read().decode() return urllib.request.urlopen(url, data=query.encode()).read().decode()
def check_answers(query, answer): def check_answers(query, answer):
ch_answer = get_ch_answer(query) ch_answer = get_ch_answer(query)
if ch_answer.strip() != answer.strip(): if ch_answer.strip() != answer.strip():
@ -68,15 +86,16 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer, file=sys.stderr) print("Fetched answer :", ch_answer, file=sys.stderr)
raise Exception("Fail on query") raise Exception("Fail on query")
class CSVHTTPServer(BaseHTTPRequestHandler): class CSVHTTPServer(BaseHTTPRequestHandler):
def _set_headers(self): def _set_headers(self):
self.send_response(200) self.send_response(200)
self.send_header('Content-type', 'text/csv') self.send_header("Content-type", "text/csv")
self.end_headers() self.end_headers()
def do_GET(self): def do_GET(self):
self._set_headers() self._set_headers()
self.wfile.write(('hello, world').encode()) self.wfile.write(("hello, world").encode())
# with open(CSV_DATA, 'r') as fl: # with open(CSV_DATA, 'r') as fl:
# reader = csv.reader(fl, delimiter=',') # reader = csv.reader(fl, delimiter=',')
# for row in reader: # for row in reader:
@ -84,33 +103,33 @@ class CSVHTTPServer(BaseHTTPRequestHandler):
return return
def read_chunk(self): def read_chunk(self):
msg = '' msg = ""
while True: while True:
sym = self.rfile.read(1) sym = self.rfile.read(1)
if sym == '': if sym == "":
break break
msg += sym.decode('utf-8') msg += sym.decode("utf-8")
if msg.endswith('\r\n'): if msg.endswith("\r\n"):
break break
length = int(msg[:-2], 16) length = int(msg[:-2], 16)
if length == 0: if length == 0:
return '' return ""
content = self.rfile.read(length) content = self.rfile.read(length)
self.rfile.read(2) # read sep \r\n self.rfile.read(2) # read sep \r\n
return content.decode('utf-8') return content.decode("utf-8")
def do_POST(self): def do_POST(self):
data = '' data = ""
while True: while True:
chunk = self.read_chunk() chunk = self.read_chunk()
if not chunk: if not chunk:
break break
data += chunk data += chunk
with StringIO(data) as fl: with StringIO(data) as fl:
reader = csv.reader(fl, delimiter=',') reader = csv.reader(fl, delimiter=",")
with open(CSV_DATA, 'a') as d: with open(CSV_DATA, "a") as d:
for row in reader: for row in reader:
d.write(','.join(row) + '\n') d.write(",".join(row) + "\n")
self._set_headers() self._set_headers()
self.wfile.write(b"ok") self.wfile.write(b"ok")
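read_chunk() above parses HTTP/1.1 chunked transfer encoding by hand. A hedged illustration of the wire format it expects (hex length, CRLF, payload, CRLF, terminated by a zero-length chunk); the sample body is made up for illustration:

# "<hex length>\r\n<payload>\r\n ... 0\r\n\r\n"
sample_chunked_body = b"5\r\nhello\r\n7\r\n, world\r\n0\r\n\r\n"
# read_chunk() yields "hello", then ", world", then "" for the zero-length chunk,
# at which point do_POST() stops accumulating and parses the collected CSV.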
@ -121,6 +140,7 @@ class CSVHTTPServer(BaseHTTPRequestHandler):
class HTTPServerV6(HTTPServer): class HTTPServerV6(HTTPServer):
address_family = socket.AF_INET6 address_family = socket.AF_INET6
def start_server(): def start_server():
if IS_IPV6: if IS_IPV6:
httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, CSVHTTPServer) httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, CSVHTTPServer)
@ -130,57 +150,87 @@ def start_server():
t = threading.Thread(target=httpd.serve_forever) t = threading.Thread(target=httpd.serve_forever)
return t, httpd return t, httpd
# test section # test section
def test_select(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests=[], answers=[], test_data=""):
with open(CSV_DATA, 'w') as f: # clear file def test_select(
f.write('') table_name="",
schema="str String,numuint UInt32,numint Int32,double Float64",
requests=[],
answers=[],
test_data="",
):
with open(CSV_DATA, "w") as f: # clear file
f.write("")
if test_data: if test_data:
with open(CSV_DATA, 'w') as f: with open(CSV_DATA, "w") as f:
f.write(test_data + "\n") f.write(test_data + "\n")
if table_name: if table_name:
get_ch_answer("drop table if exists {}".format(table_name)) get_ch_answer("drop table if exists {}".format(table_name))
get_ch_answer("create table {} ({}) engine=URL('{}', 'CSV')".format(table_name, schema, HTTP_SERVER_URL_STR)) get_ch_answer(
"create table {} ({}) engine=URL('{}', 'CSV')".format(
table_name, schema, HTTP_SERVER_URL_STR
)
)
for i in range(len(requests)): for i in range(len(requests)):
tbl = table_name tbl = table_name
if not tbl: if not tbl:
tbl = "url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema) tbl = "url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
check_answers(requests[i].format(tbl=tbl), answers[i]) check_answers(requests[i].format(tbl=tbl), answers[i])
if table_name: if table_name:
get_ch_answer("drop table if exists {}".format(table_name)) get_ch_answer("drop table if exists {}".format(table_name))
def test_insert(table_name="", schema="str String,numuint UInt32,numint Int32,double Float64", requests_insert=[], requests_select=[], answers=[]):
with open(CSV_DATA, 'w') as f: # flush test file def test_insert(
f.write('') table_name="",
schema="str String,numuint UInt32,numint Int32,double Float64",
requests_insert=[],
requests_select=[],
answers=[],
):
with open(CSV_DATA, "w") as f: # flush test file
f.write("")
if table_name: if table_name:
get_ch_answer("drop table if exists {}".format(table_name)) get_ch_answer("drop table if exists {}".format(table_name))
get_ch_answer("create table {} ({}) engine=URL('{}', 'CSV')".format(table_name, schema, HTTP_SERVER_URL_STR)) get_ch_answer(
"create table {} ({}) engine=URL('{}', 'CSV')".format(
table_name, schema, HTTP_SERVER_URL_STR
)
)
for req in requests_insert: for req in requests_insert:
tbl = table_name tbl = table_name
if not tbl: if not tbl:
tbl = "table function url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema) tbl = "table function url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
get_ch_answer(req.format(tbl=tbl)) get_ch_answer(req.format(tbl=tbl))
for i in range(len(requests_select)): for i in range(len(requests_select)):
tbl = table_name tbl = table_name
if not tbl: if not tbl:
tbl = "url('{addr}', 'CSV', '{schema}')".format(addr=HTTP_SERVER_URL_STR, schema=schema) tbl = "url('{addr}', 'CSV', '{schema}')".format(
addr=HTTP_SERVER_URL_STR, schema=schema
)
check_answers(requests_select[i].format(tbl=tbl), answers[i]) check_answers(requests_select[i].format(tbl=tbl), answers[i])
if table_name: if table_name:
get_ch_answer("drop table if exists {}".format(table_name)) get_ch_answer("drop table if exists {}".format(table_name))
def test_select_url_engine(requests=[], answers=[], test_data=""): def test_select_url_engine(requests=[], answers=[], test_data=""):
for i in range(len(requests)): for i in range(len(requests)):
check_answers(requests[i], answers[i]) check_answers(requests[i], answers[i])
def main(): def main():
test_data = "Hello,2,-2,7.7\nWorld,2,-5,8.8" test_data = "Hello,2,-2,7.7\nWorld,2,-5,8.8"
""" """
@ -203,19 +253,29 @@ def main():
""" """
if IS_IPV6: if IS_IPV6:
query = "select * from url('http://guest:guest@" + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/', 'RawBLOB', 'a String')" query = (
"select * from url('http://guest:guest@"
+ f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/', 'RawBLOB', 'a String')"
)
else: else:
query = "select * from url('http://guest:guest@" + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/', 'RawBLOB', 'a String')" query = (
"select * from url('http://guest:guest@"
+ f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/', 'RawBLOB', 'a String')"
)
select_requests_url_auth = { select_requests_url_auth = {
query : 'hello, world', query: "hello, world",
} }
t, httpd = start_server() t, httpd = start_server()
t.start() t.start()
test_select(requests=list(select_requests_url_auth.keys()), answers=list(select_requests_url_auth.values()), test_data=test_data) test_select(
requests=list(select_requests_url_auth.keys()),
answers=list(select_requests_url_auth.values()),
test_data=test_data,
)
httpd.shutdown() httpd.shutdown()
t.join() t.join()
print("PASSED") print("PASSED")
View File
@ -1,4 +1,4 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
import sys import sys
from math import sqrt, nan from math import sqrt, nan
@ -8,7 +8,7 @@ import pandas as pd
import numpy as np import numpy as np
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
@ -25,7 +25,7 @@ def twosample_proportion_ztest(s1, s2, t1, t2, alpha):
return nan, nan, nan, nan return nan, nan, nan, nan
z_stat = (p1 - p2) / se z_stat = (p1 - p2) / se
one_side = 1 - stats.norm.cdf(abs(z_stat)) one_side = 1 - stats.norm.cdf(abs(z_stat))
p_value = one_side * 2 p_value = one_side * 2
z = stats.norm.ppf(1 - 0.5 * alpha) z = stats.norm.ppf(1 - 0.5 * alpha)
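For context, a compact reference formulation of the unpooled two-proportion z-test that twosample_proportion_ztest appears to implement (s = successes, t = trials). It is a textbook sketch written around the lines visible in this hunk, not a copy of the script, and it omits the zero-variance guard that returns nan above:

from math import sqrt
from scipy import stats

def proportion_ztest_sketch(s1, s2, t1, t2, alpha=0.05):
    p1, p2 = s1 / t1, s2 / t2
    # Unpooled standard error, matching the 'unpooled' argument used below.
    se = sqrt(p1 * (1 - p1) / t1 + p2 * (1 - p2) / t2)
    z_stat = (p1 - p2) / se
    p_value = 2 * (1 - stats.norm.cdf(abs(z_stat)))
    d = stats.norm.ppf(1 - 0.5 * alpha) * se
    return z_stat, p_value, (p1 - p2) - d, (p1 - p2) + d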
@ -38,71 +38,171 @@ def twosample_proportion_ztest(s1, s2, t1, t2, alpha):
def test_and_check(name, z_stat, p_value, ci_lower, ci_upper, precision=1e-2): def test_and_check(name, z_stat, p_value, ci_lower, ci_upper, precision=1e-2):
client = ClickHouseClient() client = ClickHouseClient()
real = client.query_return_df( real = client.query_return_df(
"SELECT roundBankers({}.1, 16) as z_stat, ".format(name) + "SELECT roundBankers({}.1, 16) as z_stat, ".format(name)
"roundBankers({}.2, 16) as p_value, ".format(name) + + "roundBankers({}.2, 16) as p_value, ".format(name)
"roundBankers({}.3, 16) as ci_lower, ".format(name) + + "roundBankers({}.3, 16) as ci_lower, ".format(name)
"roundBankers({}.4, 16) as ci_upper ".format(name) + + "roundBankers({}.4, 16) as ci_upper ".format(name)
"FORMAT TabSeparatedWithNames;") + "FORMAT TabSeparatedWithNames;"
real_z_stat = real['z_stat'][0] )
real_p_value = real['p_value'][0] real_z_stat = real["z_stat"][0]
real_ci_lower = real['ci_lower'][0] real_p_value = real["p_value"][0]
real_ci_upper = real['ci_upper'][0] real_ci_lower = real["ci_lower"][0]
assert((np.isnan(real_z_stat) and np.isnan(z_stat)) or abs(real_z_stat - np.float64(z_stat)) < precision), "clickhouse_z_stat {}, py_z_stat {}".format(real_z_stat, z_stat) real_ci_upper = real["ci_upper"][0]
assert((np.isnan(real_p_value) and np.isnan(p_value)) or abs(real_p_value - np.float64(p_value)) < precision), "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value) assert (np.isnan(real_z_stat) and np.isnan(z_stat)) or abs(
assert((np.isnan(real_ci_lower) and np.isnan(ci_lower)) or abs(real_ci_lower - np.float64(ci_lower)) < precision), "clickhouse_ci_lower {}, py_ci_lower {}".format(real_ci_lower, ci_lower) real_z_stat - np.float64(z_stat)
assert((np.isnan(real_ci_upper) and np.isnan(ci_upper)) or abs(real_ci_upper - np.float64(ci_upper)) < precision), "clickhouse_ci_upper {}, py_ci_upper {}".format(real_ci_upper, ci_upper) ) < precision, "clickhouse_z_stat {}, py_z_stat {}".format(real_z_stat, z_stat)
assert (np.isnan(real_p_value) and np.isnan(p_value)) or abs(
real_p_value - np.float64(p_value)
) < precision, "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value)
assert (np.isnan(real_ci_lower) and np.isnan(ci_lower)) or abs(
real_ci_lower - np.float64(ci_lower)
) < precision, "clickhouse_ci_lower {}, py_ci_lower {}".format(
real_ci_lower, ci_lower
)
assert (np.isnan(real_ci_upper) and np.isnan(ci_upper)) or abs(
real_ci_upper - np.float64(ci_upper)
) < precision, "clickhouse_ci_upper {}, py_ci_upper {}".format(
real_ci_upper, ci_upper
)
def test_mean_ztest(): def test_mean_ztest():
counts = [0, 0] counts = [0, 0]
nobs = [0, 0] nobs = [0, 0]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) counts[0], counts[1], nobs[0], nobs[1], 0.05
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(10, 10, 10, 10, 0.05) )
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
10, 10, 10, 10, 0.05
)
counts = [10, 10] counts = [10, 10]
nobs = [10, 10] nobs = [10, 10]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) counts[0], counts[1], nobs[0], nobs[1], 0.05
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(10, 10, 10, 10, 0.05) )
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
10, 10, 10, 10, 0.05
)
counts = [16, 16] counts = [16, 16]
nobs = [16, 18] nobs = [16, 18]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [10, 20] counts = [10, 20]
nobs = [30, 40] nobs = [30, 40]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [20, 10] counts = [20, 10]
nobs = [40, 30] nobs = [40, 30]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [randrange(10,20), randrange(10,20)] counts = [randrange(10, 20), randrange(10, 20)]
nobs = [randrange(counts[0] + 1, counts[0] * 2), randrange(counts[1], counts[1] * 2)] nobs = [
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) randrange(counts[0] + 1, counts[0] * 2),
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) randrange(counts[1], counts[1] * 2),
]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [randrange(1,100), randrange(1,200)] counts = [randrange(1, 100), randrange(1, 200)]
nobs = [randrange(counts[0], counts[0] * 2), randrange(counts[1], counts[1] * 3)] nobs = [randrange(counts[0], counts[0] * 2), randrange(counts[1], counts[1] * 3)]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [randrange(1,200), randrange(1,100)] counts = [randrange(1, 200), randrange(1, 100)]
nobs = [randrange(counts[0], counts[0] * 3), randrange(counts[1], counts[1] * 2)] nobs = [randrange(counts[0], counts[0] * 3), randrange(counts[1], counts[1] * 2)]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
counts = [randrange(1,1000), randrange(1,1000)] counts = [randrange(1, 1000), randrange(1, 1000)]
nobs = [randrange(counts[0], counts[0] * 2), randrange(counts[1], counts[1] * 2)] nobs = [randrange(counts[0], counts[0] * 2), randrange(counts[1], counts[1] * 2)]
z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(counts[0], counts[1], nobs[0], nobs[1], 0.05) z_stat, p_value, ci_lower, ci_upper = twosample_proportion_ztest(
test_and_check("proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')" % (counts[0], counts[1], nobs[0], nobs[1]), z_stat, p_value, ci_lower, ci_upper) counts[0], counts[1], nobs[0], nobs[1], 0.05
)
test_and_check(
"proportionsZTest(%d, %d, %d, %d, 0.95, 'unpooled')"
% (counts[0], counts[1], nobs[0], nobs[1]),
z_stat,
p_value,
ci_lower,
ci_upper,
)
if __name__ == "__main__": if __name__ == "__main__":
test_mean_ztest() test_mean_ztest()
print("Ok.") print("Ok.")

View File

@ -1,4 +1,4 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import os import os
import sys import sys
from statistics import variance from statistics import variance
@ -7,7 +7,7 @@ import pandas as pd
import numpy as np import numpy as np
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
@ -30,46 +30,95 @@ def twosample_mean_ztest(rvs1, rvs2, alpha=0.05):
def test_and_check(name, a, b, t_stat, p_value, ci_low, ci_high, precision=1e-2): def test_and_check(name, a, b, t_stat, p_value, ci_low, ci_high, precision=1e-2):
client = ClickHouseClient() client = ClickHouseClient()
client.query("DROP TABLE IF EXISTS ztest;") client.query("DROP TABLE IF EXISTS ztest;")
client.query("CREATE TABLE ztest (left Float64, right UInt8) ENGINE = Memory;"); client.query("CREATE TABLE ztest (left Float64, right UInt8) ENGINE = Memory;")
client.query("INSERT INTO ztest VALUES {};".format(", ".join(['({},{})'.format(i, 0) for i in a]))) client.query(
client.query("INSERT INTO ztest VALUES {};".format(", ".join(['({},{})'.format(j, 1) for j in b]))) "INSERT INTO ztest VALUES {};".format(
", ".join(["({},{})".format(i, 0) for i in a])
)
)
client.query(
"INSERT INTO ztest VALUES {};".format(
", ".join(["({},{})".format(j, 1) for j in b])
)
)
real = client.query_return_df( real = client.query_return_df(
"SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name) + "SELECT roundBankers({}(left, right).1, 16) as t_stat, ".format(name)
"roundBankers({}(left, right).2, 16) as p_value, ".format(name) + + "roundBankers({}(left, right).2, 16) as p_value, ".format(name)
"roundBankers({}(left, right).3, 16) as ci_low, ".format(name) + + "roundBankers({}(left, right).3, 16) as ci_low, ".format(name)
"roundBankers({}(left, right).4, 16) as ci_high ".format(name) + + "roundBankers({}(left, right).4, 16) as ci_high ".format(name)
"FROM ztest FORMAT TabSeparatedWithNames;") + "FROM ztest FORMAT TabSeparatedWithNames;"
real_t_stat = real['t_stat'][0] )
real_p_value = real['p_value'][0] real_t_stat = real["t_stat"][0]
real_ci_low = real['ci_low'][0] real_p_value = real["p_value"][0]
real_ci_high = real['ci_high'][0] real_ci_low = real["ci_low"][0]
assert(abs(real_t_stat - np.float64(t_stat)) < precision), "clickhouse_t_stat {}, py_t_stat {}".format(real_t_stat, t_stat) real_ci_high = real["ci_high"][0]
assert(abs(real_p_value - np.float64(p_value)) < precision), "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value) assert (
assert(abs(real_ci_low - np.float64(ci_low)) < precision), "clickhouse_ci_low {}, py_ci_low {}".format(real_ci_low, ci_low) abs(real_t_stat - np.float64(t_stat)) < precision
assert(abs(real_ci_high - np.float64(ci_high)) < precision), "clickhouse_ci_high {}, py_ci_high {}".format(real_ci_high, ci_high) ), "clickhouse_t_stat {}, py_t_stat {}".format(real_t_stat, t_stat)
assert (
abs(real_p_value - np.float64(p_value)) < precision
), "clickhouse_p_value {}, py_p_value {}".format(real_p_value, p_value)
assert (
abs(real_ci_low - np.float64(ci_low)) < precision
), "clickhouse_ci_low {}, py_ci_low {}".format(real_ci_low, ci_low)
assert (
abs(real_ci_high - np.float64(ci_high)) < precision
), "clickhouse_ci_high {}, py_ci_high {}".format(real_ci_high, ci_high)
client.query("DROP TABLE IF EXISTS ztest;") client.query("DROP TABLE IF EXISTS ztest;")
def test_mean_ztest(): def test_mean_ztest():
rvs1 = np.round(stats.norm.rvs(loc=1, scale=5,size=500), 2) rvs1 = np.round(stats.norm.rvs(loc=1, scale=5, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=10, scale=5,size=500), 2) rvs2 = np.round(stats.norm.rvs(loc=10, scale=5, size=500), 2)
s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2) s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2)
test_and_check("meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)), rvs1, rvs2, s, p, cl, ch) test_and_check(
"meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)),
rvs1,
rvs2,
s,
p,
cl,
ch,
)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=5,size=500), 2) rvs1 = np.round(stats.norm.rvs(loc=0, scale=5, size=500), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=5,size=500), 2) rvs2 = np.round(stats.norm.rvs(loc=0, scale=5, size=500), 2)
s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2) s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2)
test_and_check("meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)), rvs1, rvs2, s, p, cl, ch) test_and_check(
"meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)),
rvs1,
rvs2,
s,
p,
cl,
ch,
)
rvs1 = np.round(stats.norm.rvs(loc=2, scale=10,size=512), 2) rvs1 = np.round(stats.norm.rvs(loc=2, scale=10, size=512), 2)
rvs2 = np.round(stats.norm.rvs(loc=5, scale=20,size=1024), 2) rvs2 = np.round(stats.norm.rvs(loc=5, scale=20, size=1024), 2)
s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2) s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2)
test_and_check("meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)), rvs1, rvs2, s, p, cl, ch) test_and_check(
"meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)),
rvs1,
rvs2,
s,
p,
cl,
ch,
)
rvs1 = np.round(stats.norm.rvs(loc=0, scale=10,size=1024), 2) rvs1 = np.round(stats.norm.rvs(loc=0, scale=10, size=1024), 2)
rvs2 = np.round(stats.norm.rvs(loc=0, scale=10,size=512), 2) rvs2 = np.round(stats.norm.rvs(loc=0, scale=10, size=512), 2)
s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2) s, p, cl, ch = twosample_mean_ztest(rvs1, rvs2)
test_and_check("meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)), rvs1, rvs2, s, p, cl, ch) test_and_check(
"meanZTest(%f, %f, 0.95)" % (variance(rvs1), variance(rvs2)),
rvs1,
rvs2,
s,
p,
cl,
ch,
)
if __name__ == "__main__": if __name__ == "__main__":
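
The meanZTest checks above compare against a twosample_mean_ztest helper whose definition falls outside the hunks shown here. As a rough reference, the large-sample two-sample z-test for means looks like the sketch below; the function name and the numpy/scipy imports are assumptions for illustration only, not part of this commit.

# Hypothetical sketch (not from this commit): two-sample z-test for means.
import numpy as np
from scipy.stats import norm

def twosample_mean_ztest_sketch(sample1, sample2, alpha=0.05):
    mean_diff = np.mean(sample1) - np.mean(sample2)
    # Standard error from the sample variances (large-sample z approximation).
    se = (np.var(sample1, ddof=1) / len(sample1) + np.var(sample2, ddof=1) / len(sample2)) ** 0.5
    z_stat = mean_diff / se
    # Two-sided p-value and confidence interval for the difference in means.
    p_value = 2 * (1 - norm.cdf(abs(z_stat)))
    z_crit = norm.ppf(1 - alpha / 2)
    return z_stat, p_value, mean_diff - z_crit * se, mean_diff + z_crit * se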

View File

@ -3,47 +3,71 @@ import os
import sys import sys
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
CLICKHOUSE_URL = os.environ.get('CLICKHOUSE_URL') CLICKHOUSE_URL = os.environ.get("CLICKHOUSE_URL")
CLICKHOUSE_TMP = os.environ.get('CLICKHOUSE_TMP') CLICKHOUSE_TMP = os.environ.get("CLICKHOUSE_TMP")
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
client = ClickHouseClient() client = ClickHouseClient()
def run_test(data_format, gen_data_template, settings): def run_test(data_format, gen_data_template, settings):
print(data_format) print(data_format)
client.query("TRUNCATE TABLE t_async_insert") client.query("TRUNCATE TABLE t_async_insert")
expected = client.query(gen_data_template.format("TSV")).strip() expected = client.query(gen_data_template.format("TSV")).strip()
data = client.query(gen_data_template.format(data_format), settings=settings,binary_result=True) data = client.query(
gen_data_template.format(data_format), settings=settings, binary_result=True
)
insert_query = "INSERT INTO t_async_insert FORMAT {}".format(data_format) insert_query = "INSERT INTO t_async_insert FORMAT {}".format(data_format)
client.query_with_data(insert_query, data, settings=settings) client.query_with_data(insert_query, data, settings=settings)
result = client.query("SELECT * FROM t_async_insert FORMAT TSV").strip() result = client.query("SELECT * FROM t_async_insert FORMAT TSV").strip()
if result != expected: if result != expected:
print("Failed for format {}.\nExpected:\n{}\nGot:\n{}\n".format(data_format, expected, result)) print(
"Failed for format {}.\nExpected:\n{}\nGot:\n{}\n".format(
data_format, expected, result
)
)
exit(1) exit(1)
formats = client.query("SELECT name FROM system.formats WHERE is_input AND is_output \
AND name NOT IN ('CapnProto', 'RawBLOB', 'Template', 'ProtobufSingle', 'LineAsString', 'Protobuf', 'ProtobufList') ORDER BY name").strip().split('\n') formats = (
client.query(
"SELECT name FROM system.formats WHERE is_input AND is_output \
AND name NOT IN ('CapnProto', 'RawBLOB', 'Template', 'ProtobufSingle', 'LineAsString', 'Protobuf', 'ProtobufList') ORDER BY name"
)
.strip()
.split("\n")
)
# Generic formats # Generic formats
client.query("DROP TABLE IF EXISTS t_async_insert") client.query("DROP TABLE IF EXISTS t_async_insert")
client.query("CREATE TABLE t_async_insert (id UInt64, s String, arr Array(UInt64)) ENGINE = Memory") client.query(
"CREATE TABLE t_async_insert (id UInt64, s String, arr Array(UInt64)) ENGINE = Memory"
)
gen_data_query = "SELECT number AS id, toString(number) AS s, range(number) AS arr FROM numbers(10) FORMAT {}" gen_data_query = "SELECT number AS id, toString(number) AS s, range(number) AS arr FROM numbers(10) FORMAT {}"
for data_format in formats: for data_format in formats:
run_test(data_format, gen_data_query, settings={"async_insert": 1, "wait_for_async_insert": 1}) run_test(
data_format,
gen_data_query,
settings={"async_insert": 1, "wait_for_async_insert": 1},
)
# LineAsString # LineAsString
client.query("DROP TABLE IF EXISTS t_async_insert") client.query("DROP TABLE IF EXISTS t_async_insert")
client.query("CREATE TABLE t_async_insert (s String) ENGINE = Memory") client.query("CREATE TABLE t_async_insert (s String) ENGINE = Memory")
gen_data_query = "SELECT toString(number) AS s FROM numbers(10) FORMAT {}" gen_data_query = "SELECT toString(number) AS s FROM numbers(10) FORMAT {}"
run_test('LineAsString', gen_data_query, settings={"async_insert": 1, "wait_for_async_insert": 1}) run_test(
"LineAsString",
gen_data_query,
settings={"async_insert": 1, "wait_for_async_insert": 1},
)
# TODO: add CapnProto and Protobuf # TODO: add CapnProto and Protobuf

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
from http.server import SimpleHTTPRequestHandler,HTTPServer from http.server import SimpleHTTPRequestHandler, HTTPServer
import socket import socket
import sys import sys
import threading import threading
@ -17,6 +17,7 @@ def is_ipv6(host):
except: except:
return True return True
def get_local_port(host, ipv6): def get_local_port(host, ipv6):
if ipv6: if ipv6:
family = socket.AF_INET6 family = socket.AF_INET6
@ -27,20 +28,19 @@ def get_local_port(host, ipv6):
fd.bind((host, 0)) fd.bind((host, 0))
return fd.getsockname()[1] return fd.getsockname()[1]
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', 'localhost')
CLICKHOUSE_PORT_HTTP = os.environ.get('CLICKHOUSE_PORT_HTTP', '8123') CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "localhost")
CLICKHOUSE_PORT_HTTP = os.environ.get("CLICKHOUSE_PORT_HTTP", "8123")
# Server returns this JSON response. # Server returns this JSON response.
SERVER_JSON_RESPONSE = \ SERVER_JSON_RESPONSE = """{
'''{
"login": "ClickHouse", "login": "ClickHouse",
"id": 54801242, "id": 54801242,
"name": "ClickHouse", "name": "ClickHouse",
"company": null "company": null
}''' }"""
EXPECTED_ANSWER = \ EXPECTED_ANSWER = """{\\n\\t"login": "ClickHouse",\\n\\t"id": 54801242,\\n\\t"name": "ClickHouse",\\n\\t"company": null\\n}"""
'''{\\n\\t"login": "ClickHouse",\\n\\t"id": 54801242,\\n\\t"name": "ClickHouse",\\n\\t"company": null\\n}'''
##################################################################################### #####################################################################################
# This test starts an HTTP server and serves data to clickhouse url-engine based table. # This test starts an HTTP server and serves data to clickhouse url-engine based table.
@ -51,26 +51,38 @@ EXPECTED_ANSWER = \
##################################################################################### #####################################################################################
# IP-address of this host accessible from the outside world. Get the first one # IP-address of this host accessible from the outside world. Get the first one
HTTP_SERVER_HOST = subprocess.check_output(['hostname', '-i']).decode('utf-8').strip().split()[0] HTTP_SERVER_HOST = (
subprocess.check_output(["hostname", "-i"]).decode("utf-8").strip().split()[0]
)
IS_IPV6 = is_ipv6(HTTP_SERVER_HOST) IS_IPV6 = is_ipv6(HTTP_SERVER_HOST)
HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6) HTTP_SERVER_PORT = get_local_port(HTTP_SERVER_HOST, IS_IPV6)
# IP address and port of the HTTP server started from this script. # IP address and port of the HTTP server started from this script.
HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT) HTTP_SERVER_ADDRESS = (HTTP_SERVER_HOST, HTTP_SERVER_PORT)
if IS_IPV6: if IS_IPV6:
HTTP_SERVER_URL_STR = 'http://' + f'[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}' + "/" HTTP_SERVER_URL_STR = (
"http://"
+ f"[{str(HTTP_SERVER_ADDRESS[0])}]:{str(HTTP_SERVER_ADDRESS[1])}"
+ "/"
)
else: else:
HTTP_SERVER_URL_STR = 'http://' + f'{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}' + "/" HTTP_SERVER_URL_STR = (
"http://" + f"{str(HTTP_SERVER_ADDRESS[0])}:{str(HTTP_SERVER_ADDRESS[1])}" + "/"
)
def get_ch_answer(query): def get_ch_answer(query):
host = CLICKHOUSE_HOST host = CLICKHOUSE_HOST
if IS_IPV6: if IS_IPV6:
host = f'[{host}]' host = f"[{host}]"
url = os.environ.get('CLICKHOUSE_URL', 'http://{host}:{port}'.format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP)) url = os.environ.get(
"CLICKHOUSE_URL",
"http://{host}:{port}".format(host=CLICKHOUSE_HOST, port=CLICKHOUSE_PORT_HTTP),
)
return urllib.request.urlopen(url, data=query.encode()).read().decode() return urllib.request.urlopen(url, data=query.encode()).read().decode()
def check_answers(query, answer): def check_answers(query, answer):
ch_answer = get_ch_answer(query) ch_answer = get_ch_answer(query)
if ch_answer.strip() != answer.strip(): if ch_answer.strip() != answer.strip():
@ -79,16 +91,17 @@ def check_answers(query, answer):
print("Fetched answer :", ch_answer, file=sys.stderr) print("Fetched answer :", ch_answer, file=sys.stderr)
raise Exception("Fail on query") raise Exception("Fail on query")
# Server that checks the User-Agent header. # Server that checks the User-Agent header.
class HttpProcessor(SimpleHTTPRequestHandler): class HttpProcessor(SimpleHTTPRequestHandler):
def _set_headers(self): def _set_headers(self):
user_agent = self.headers.get('User-Agent') user_agent = self.headers.get("User-Agent")
if user_agent and user_agent.startswith('ClickHouse/'): if user_agent and user_agent.startswith("ClickHouse/"):
self.send_response(200) self.send_response(200)
else: else:
self.send_response(403) self.send_response(403)
self.send_header('Content-Type', 'text/csv') self.send_header("Content-Type", "text/csv")
self.end_headers() self.end_headers()
def do_GET(self): def do_GET(self):
@ -98,9 +111,11 @@ class HttpProcessor(SimpleHTTPRequestHandler):
def log_message(self, format, *args): def log_message(self, format, *args):
return return
class HTTPServerV6(HTTPServer): class HTTPServerV6(HTTPServer):
address_family = socket.AF_INET6 address_family = socket.AF_INET6
def start_server(requests_amount): def start_server(requests_amount):
if IS_IPV6: if IS_IPV6:
httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, HttpProcessor) httpd = HTTPServerV6(HTTP_SERVER_ADDRESS, HttpProcessor)
@ -114,15 +129,18 @@ def start_server(requests_amount):
t = threading.Thread(target=real_func) t = threading.Thread(target=real_func)
return t return t
##################################################################### #####################################################################
# Testing area. # Testing area.
##################################################################### #####################################################################
def test_select(): def test_select():
global HTTP_SERVER_URL_STR global HTTP_SERVER_URL_STR
query = 'SELECT * FROM url(\'{}\',\'JSONAsString\');'.format(HTTP_SERVER_URL_STR) query = "SELECT * FROM url('{}','JSONAsString');".format(HTTP_SERVER_URL_STR)
check_answers(query, EXPECTED_ANSWER) check_answers(query, EXPECTED_ANSWER)
def main(): def main():
# HEAD + GET # HEAD + GET
t = start_server(3) t = start_server(3)
@ -131,6 +149,7 @@ def main():
t.join() t.join()
print("PASSED") print("PASSED")
if __name__ == "__main__": if __name__ == "__main__":
try: try:
main() main()
@ -141,4 +160,3 @@ if __name__ == "__main__":
sys.stderr.flush() sys.stderr.flush()
os._exit(1) os._exit(1)

View File

@ -122,7 +122,7 @@ class HttpProcessor(BaseHTTPRequestHandler):
get_call_num = 0 get_call_num = 0
responses_to_get = [] responses_to_get = []
def send_head(self, from_get = False): def send_head(self, from_get=False):
if self.headers["Range"] and HttpProcessor.allow_range: if self.headers["Range"] and HttpProcessor.allow_range:
try: try:
self.range = parse_byte_range(self.headers["Range"]) self.range = parse_byte_range(self.headers["Range"])
@ -146,7 +146,9 @@ class HttpProcessor(BaseHTTPRequestHandler):
self.send_error(416, "Requested Range Not Satisfiable") self.send_error(416, "Requested Range Not Satisfiable")
return None return None
retry_range_request = first != 0 and from_get is True and len(HttpProcessor.responses_to_get) > 0 retry_range_request = (
first != 0 and from_get is True and len(HttpProcessor.responses_to_get) > 0
)
if retry_range_request: if retry_range_request:
code = HttpProcessor.responses_to_get.pop() code = HttpProcessor.responses_to_get.pop()
if code not in HttpProcessor.responses: if code not in HttpProcessor.responses:
@ -244,7 +246,9 @@ def run_test(allow_range, settings, check_retries=False):
raise Exception("HTTP Range was not used when supported") raise Exception("HTTP Range was not used when supported")
if check_retries and len(HttpProcessor.responses_to_get) > 0: if check_retries and len(HttpProcessor.responses_to_get) > 0:
raise Exception("Expected to get http response 500, which had to be retried, but 200 ok returned and then retried") raise Exception(
"Expected to get http response 500, which had to be retried, but 200 ok returned and then retried"
)
if retries_num > 0: if retries_num > 0:
expected_get_call_num += retries_num - 1 expected_get_call_num += retries_num - 1
@ -263,7 +267,7 @@ def run_test(allow_range, settings, check_retries=False):
def main(): def main():
settings = {"max_download_buffer_size" : 20} settings = {"max_download_buffer_size": 20}
# Test Accept-Ranges=False # Test Accept-Ranges=False
run_test(allow_range=False, settings=settings) run_test(allow_range=False, settings=settings)
@ -271,7 +275,7 @@ def main():
run_test(allow_range=True, settings=settings) run_test(allow_range=True, settings=settings)
# Test Accept-Ranges=True, parallel download is used # Test Accept-Ranges=True, parallel download is used
settings = {"max_download_buffer_size" : 10} settings = {"max_download_buffer_size": 10}
run_test(allow_range=True, settings=settings) run_test(allow_range=True, settings=settings)
# Test Accept-Ranges=True, parallel download is not used, # Test Accept-Ranges=True, parallel download is not used,

View File

@ -7,7 +7,7 @@ import pandas as pd
import numpy as np import numpy as np
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
@ -22,15 +22,22 @@ def test_and_check(rvs, n_groups, f_stat, p_value, precision=1e-2):
client.query("DROP TABLE IF EXISTS anova;") client.query("DROP TABLE IF EXISTS anova;")
client.query("CREATE TABLE anova (left Float64, right UInt64) ENGINE = Memory;") client.query("CREATE TABLE anova (left Float64, right UInt64) ENGINE = Memory;")
for group in range(n_groups): for group in range(n_groups):
client.query(f'''INSERT INTO anova VALUES {", ".join([f'({i},{group})' for i in rvs[group]])};''') client.query(
f"""INSERT INTO anova VALUES {", ".join([f'({i},{group})' for i in rvs[group]])};"""
)
real = client.query_return_df( real = client.query_return_df(
'''SELECT roundBankers(a.1, 16) as f_stat, roundBankers(a.2, 16) as p_value FROM (SELECT anova(left, right) as a FROM anova) FORMAT TabSeparatedWithNames;''') """SELECT roundBankers(a.1, 16) as f_stat, roundBankers(a.2, 16) as p_value FROM (SELECT anova(left, right) as a FROM anova) FORMAT TabSeparatedWithNames;"""
)
real_f_stat = real['f_stat'][0] real_f_stat = real["f_stat"][0]
real_p_value = real['p_value'][0] real_p_value = real["p_value"][0]
assert(abs(real_f_stat - np.float64(f_stat)) < precision), f"clickhouse_f_stat {real_f_stat}, py_f_stat {f_stat}" assert (
assert(abs(real_p_value - np.float64(p_value)) < precision), f"clickhouse_p_value {real_p_value}, py_p_value {p_value}" abs(real_f_stat - np.float64(f_stat)) < precision
), f"clickhouse_f_stat {real_f_stat}, py_f_stat {f_stat}"
assert (
abs(real_p_value - np.float64(p_value)) < precision
), f"clickhouse_p_value {real_p_value}, py_p_value {p_value}"
client.query("DROP TABLE IF EXISTS anova;") client.query("DROP TABLE IF EXISTS anova;")

View File

@ -123,10 +123,14 @@ Uses FinishSortingTransform: {}
for query in queries: for query in queries:
check_query(query["where"], query["order_by"], query["optimize"], False) check_query(query["where"], query["order_by"], query["optimize"], False)
check_query(query["where"], query["order_by"] + ["e"], query["optimize"], query["optimize"]) check_query(
query["where"], query["order_by"] + ["e"], query["optimize"], query["optimize"]
)
where_columns = [f"bitNot({col})" for col in query["where"]] where_columns = [f"bitNot({col})" for col in query["where"]]
check_query(where_columns, query["order_by"], query["optimize"], False) check_query(where_columns, query["order_by"], query["optimize"], False)
check_query(where_columns, query["order_by"] + ["e"], query["optimize"], query["optimize"]) check_query(
where_columns, query["order_by"] + ["e"], query["optimize"], query["optimize"]
)
print("OK") print("OK")

View File

@ -8,8 +8,8 @@ TRANSFER_ENCODING_HEADER = "Transfer-Encoding"
def main(): def main():
host = os.environ['CLICKHOUSE_HOST'] host = os.environ["CLICKHOUSE_HOST"]
port = int(os.environ['CLICKHOUSE_PORT_HTTP']) port = int(os.environ["CLICKHOUSE_PORT_HTTP"])
sock = socket(AF_INET, SOCK_STREAM) sock = socket(AF_INET, SOCK_STREAM)
sock.connect((host, port)) sock.connect((host, port))
@ -47,4 +47,3 @@ def main():
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -5,9 +5,10 @@ import os
import uuid import uuid
import json import json
CLICKHOUSE_HOST = os.environ.get('CLICKHOUSE_HOST', '127.0.0.1') CLICKHOUSE_HOST = os.environ.get("CLICKHOUSE_HOST", "127.0.0.1")
CLICKHOUSE_PORT = int(os.environ.get('CLICKHOUSE_PORT_TCP', '900000')) CLICKHOUSE_PORT = int(os.environ.get("CLICKHOUSE_PORT_TCP", "900000"))
CLICKHOUSE_DATABASE = os.environ.get('CLICKHOUSE_DATABASE', 'default') CLICKHOUSE_DATABASE = os.environ.get("CLICKHOUSE_DATABASE", "default")
def writeVarUInt(x, ba): def writeVarUInt(x, ba):
for _ in range(0, 9): for _ in range(0, 9):
@ -24,12 +25,12 @@ def writeVarUInt(x, ba):
def writeStringBinary(s, ba): def writeStringBinary(s, ba):
b = bytes(s, 'utf-8') b = bytes(s, "utf-8")
writeVarUInt(len(s), ba) writeVarUInt(len(s), ba)
ba.extend(b) ba.extend(b)
def readStrict(s, size = 1): def readStrict(s, size=1):
res = bytearray() res = bytearray()
while size: while size:
cur = s.recv(size) cur = s.recv(size)
@ -48,18 +49,23 @@ def readUInt(s, size=1):
val += res[i] << (i * 8) val += res[i] << (i * 8)
return val return val
def readUInt8(s): def readUInt8(s):
return readUInt(s) return readUInt(s)
def readUInt16(s): def readUInt16(s):
return readUInt(s, 2) return readUInt(s, 2)
def readUInt32(s): def readUInt32(s):
return readUInt(s, 4) return readUInt(s, 4)
def readUInt64(s): def readUInt64(s):
return readUInt(s, 8) return readUInt(s, 8)
def readVarUInt(s): def readVarUInt(s):
x = 0 x = 0
for i in range(9): for i in range(9):
@ -75,25 +81,25 @@ def readVarUInt(s):
def readStringBinary(s): def readStringBinary(s):
size = readVarUInt(s) size = readVarUInt(s)
s = readStrict(s, size) s = readStrict(s, size)
return s.decode('utf-8') return s.decode("utf-8")
def sendHello(s): def sendHello(s):
ba = bytearray() ba = bytearray()
writeVarUInt(0, ba) # Hello writeVarUInt(0, ba) # Hello
writeStringBinary('simple native protocol', ba) writeStringBinary("simple native protocol", ba)
writeVarUInt(21, ba) writeVarUInt(21, ba)
writeVarUInt(9, ba) writeVarUInt(9, ba)
writeVarUInt(54449, ba) writeVarUInt(54449, ba)
writeStringBinary(CLICKHOUSE_DATABASE, ba) # database writeStringBinary(CLICKHOUSE_DATABASE, ba) # database
writeStringBinary('default', ba) # user writeStringBinary("default", ba) # user
writeStringBinary('', ba) # pwd writeStringBinary("", ba) # pwd
s.sendall(ba) s.sendall(ba)
def receiveHello(s): def receiveHello(s):
p_type = readVarUInt(s) p_type = readVarUInt(s)
assert (p_type == 0) # Hello assert p_type == 0 # Hello
server_name = readStringBinary(s) server_name = readStringBinary(s)
# print("Server name: ", server_name) # print("Server name: ", server_name)
server_version_major = readVarUInt(s) server_version_major = readVarUInt(s)
@ -111,65 +117,65 @@ def receiveHello(s):
def serializeClientInfo(ba, query_id): def serializeClientInfo(ba, query_id):
writeStringBinary('default', ba) # initial_user writeStringBinary("default", ba) # initial_user
writeStringBinary(query_id, ba) # initial_query_id writeStringBinary(query_id, ba) # initial_query_id
writeStringBinary('127.0.0.1:9000', ba) # initial_address writeStringBinary("127.0.0.1:9000", ba) # initial_address
ba.extend([0] * 8) # initial_query_start_time_microseconds ba.extend([0] * 8) # initial_query_start_time_microseconds
ba.append(1) # TCP ba.append(1) # TCP
writeStringBinary('os_user', ba) # os_user writeStringBinary("os_user", ba) # os_user
writeStringBinary('client_hostname', ba) # client_hostname writeStringBinary("client_hostname", ba) # client_hostname
writeStringBinary('client_name', ba) # client_name writeStringBinary("client_name", ba) # client_name
writeVarUInt(21, ba) writeVarUInt(21, ba)
writeVarUInt(9, ba) writeVarUInt(9, ba)
writeVarUInt(54449, ba) writeVarUInt(54449, ba)
writeStringBinary('', ba) # quota_key writeStringBinary("", ba) # quota_key
writeVarUInt(0, ba) # distributed_depth writeVarUInt(0, ba) # distributed_depth
writeVarUInt(1, ba) # client_version_patch writeVarUInt(1, ba) # client_version_patch
ba.append(0) # No telemetry ba.append(0) # No telemetry
def sendQuery(s, query): def sendQuery(s, query):
ba = bytearray() ba = bytearray()
query_id = uuid.uuid4().hex query_id = uuid.uuid4().hex
writeVarUInt(1, ba) # query writeVarUInt(1, ba) # query
writeStringBinary(query_id, ba) writeStringBinary(query_id, ba)
ba.append(1) # INITIAL_QUERY ba.append(1) # INITIAL_QUERY
# client info # client info
serializeClientInfo(ba, query_id) serializeClientInfo(ba, query_id)
writeStringBinary('', ba) # No settings writeStringBinary("", ba) # No settings
writeStringBinary('', ba) # No interserver secret writeStringBinary("", ba) # No interserver secret
writeVarUInt(2, ba) # Stage - Complete writeVarUInt(2, ba) # Stage - Complete
ba.append(0) # No compression ba.append(0) # No compression
writeStringBinary(query, ba) # query, finally writeStringBinary(query, ba) # query, finally
s.sendall(ba) s.sendall(ba)
def serializeBlockInfo(ba): def serializeBlockInfo(ba):
writeVarUInt(1, ba) # 1 writeVarUInt(1, ba) # 1
ba.append(0) # is_overflows ba.append(0) # is_overflows
writeVarUInt(2, ba) # 2 writeVarUInt(2, ba) # 2
writeVarUInt(0, ba) # 0 writeVarUInt(0, ba) # 0
ba.extend([0] * 4) # bucket_num ba.extend([0] * 4) # bucket_num
def sendEmptyBlock(s): def sendEmptyBlock(s):
ba = bytearray() ba = bytearray()
writeVarUInt(2, ba) # Data writeVarUInt(2, ba) # Data
writeStringBinary('', ba) writeStringBinary("", ba)
serializeBlockInfo(ba) serializeBlockInfo(ba)
writeVarUInt(0, ba) # rows writeVarUInt(0, ba) # rows
writeVarUInt(0, ba) # columns writeVarUInt(0, ba) # columns
s.sendall(ba) s.sendall(ba)
def assertPacket(packet, expected): def assertPacket(packet, expected):
assert(packet == expected), packet assert packet == expected, packet
class Progress(): class Progress:
def __init__(self): def __init__(self):
# NOTE: this is done in ctor to initialize __dict__ # NOTE: this is done in ctor to initialize __dict__
self.read_rows = 0 self.read_rows = 0
@ -198,11 +204,12 @@ class Progress():
def __bool__(self): def __bool__(self):
return ( return (
self.read_rows > 0 or self.read_rows > 0
self.read_bytes > 0 or or self.read_bytes > 0
self.total_rows_to_read > 0 or or self.total_rows_to_read > 0
self.written_rows > 0 or or self.written_rows > 0
self.written_bytes > 0) or self.written_bytes > 0
)
def readProgress(s): def readProgress(s):
@ -219,13 +226,14 @@ def readProgress(s):
progress.readPacket(s) progress.readPacket(s)
return progress return progress
def readException(s): def readException(s):
code = readUInt32(s) code = readUInt32(s)
name = readStringBinary(s) name = readStringBinary(s)
text = readStringBinary(s) text = readStringBinary(s)
readStringBinary(s) # trace readStringBinary(s) # trace
assertPacket(readUInt8(s), 0) # has_nested assertPacket(readUInt8(s), 0) # has_nested
return "code {}: {}".format(code, text.replace('DB::Exception:', '')) return "code {}: {}".format(code, text.replace("DB::Exception:", ""))
def main(): def main():
@ -236,7 +244,10 @@ def main():
receiveHello(s) receiveHello(s)
# For 1 second sleep and 1000ms of interactive_delay we definitely should have a non-zero progress packet. # For 1 second sleep and 1000ms of interactive_delay we definitely should have a non-zero progress packet.
# NOTE: interactive_delay=0 cannot be used since in this case CompletedPipelineExecutor will not call the cancelled callback. # NOTE: interactive_delay=0 cannot be used since in this case CompletedPipelineExecutor will not call the cancelled callback.
sendQuery(s, "insert into function null('_ Int') select sleep(1) from numbers(2) settings max_block_size=1, interactive_delay=1000") sendQuery(
s,
"insert into function null('_ Int') select sleep(1) from numbers(2) settings max_block_size=1, interactive_delay=1000",
)
# external tables # external tables
sendEmptyBlock(s) sendEmptyBlock(s)
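
The writeVarUInt/readVarUInt helpers in the file above implement the 7-bit variable-length integers (LEB128-style) used throughout the native protocol exchange. A small standalone illustration of that encoding follows; the function names are mine, not part of this commit.

# Hypothetical standalone illustration (not from this commit): 7-bit varint coding,
# mirroring the writeVarUInt/readVarUInt helpers above.
def encode_varuint(x):
    out = bytearray()
    while True:
        byte = x & 0x7F
        x >>= 7
        if x:
            out.append(byte | 0x80)  # high bit set: more bytes follow
        else:
            out.append(byte)
            return bytes(out)

def decode_varuint(data):
    value, shift = 0, 0
    for byte in data:
        value |= (byte & 0x7F) << shift
        shift += 7
        if not byte & 0x80:
            break
    return value

assert encode_varuint(300) == b"\xac\x02"
assert decode_varuint(b"\xac\x02") == 300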

View File

@ -4,18 +4,19 @@ import os
import sys import sys
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
class Tester: class Tester:
''' """
- Creates test table - Creates test table
- Deletes the specified range of rows - Deletes the specified range of rows
- Masks another range using row-level policy - Masks another range using row-level policy
- Runs some read queries and checks that the results - Runs some read queries and checks that the results
''' """
def __init__(self, session, url, index_granularity, total_rows): def __init__(self, session, url, index_granularity, total_rows):
self.session = session self.session = session
self.url = url self.url = url
@ -25,10 +26,10 @@ class Tester:
self.repro_queries = [] self.repro_queries = []
def report_error(self): def report_error(self):
print('Repro steps:', '\n\n\t'.join(self.repro_queries)) print("Repro steps:", "\n\n\t".join(self.repro_queries))
exit(1) exit(1)
def query(self, query_text, include_in_repro_steps = True, expected_data = None): def query(self, query_text, include_in_repro_steps=True, expected_data=None):
self.repro_queries.append(query_text) self.repro_queries.append(query_text)
resp = self.session.post(self.url, data=query_text) resp = self.session.post(self.url, data=query_text)
if resp.status_code != 200: if resp.status_code != 200:
@ -36,113 +37,187 @@ class Tester:
error = resp.text[0:40] error = resp.text[0:40]
if error not in self.reported_errors: if error not in self.reported_errors:
self.reported_errors.add(error) self.reported_errors.add(error)
print('Code:', resp.status_code) print("Code:", resp.status_code)
print('Result:', resp.text) print("Result:", resp.text)
self.report_error() self.report_error()
result = resp.text result = resp.text
# Check that the result is as expected # Check that the result is as expected
if ((not expected_data is None) and (int(result) != len(expected_data))): if (not expected_data is None) and (int(result) != len(expected_data)):
print('Expected {} rows, got {}'.format(len(expected_data), result)) print("Expected {} rows, got {}".format(len(expected_data), result))
print('Expected data:' + str(expected_data)) print("Expected data:" + str(expected_data))
self.report_error() self.report_error()
if not include_in_repro_steps: if not include_in_repro_steps:
self.repro_queries.pop() self.repro_queries.pop()
def check_data(
def check_data(self, all_data, delete_range_start, delete_range_end, row_level_policy_range_start, row_level_policy_range_end): self,
all_data,
delete_range_start,
delete_range_end,
row_level_policy_range_start,
row_level_policy_range_end,
):
all_data_after_delete = all_data[ all_data_after_delete = all_data[
~((all_data.a == 0) & ~(
(all_data.b > delete_range_start) & (all_data.a == 0)
(all_data.b <= delete_range_end))] & (all_data.b > delete_range_start)
& (all_data.b <= delete_range_end)
)
]
all_data_after_row_policy = all_data_after_delete[ all_data_after_row_policy = all_data_after_delete[
(all_data_after_delete.b <= row_level_policy_range_start) | (all_data_after_delete.b <= row_level_policy_range_start)
(all_data_after_delete.b > row_level_policy_range_end)] | (all_data_after_delete.b > row_level_policy_range_end)
]
for to_select in ['count()', 'sum(d)']: # Test reading with and without column with default value for to_select in [
self.query('SELECT {} FROM tab_02473;'.format(to_select), False, all_data_after_row_policy) "count()",
"sum(d)",
]: # Test reading with and without column with default value
self.query(
"SELECT {} FROM tab_02473;".format(to_select),
False,
all_data_after_row_policy,
)
delta = 10 delta = 10
for query_range_start in [0, delta]: for query_range_start in [0, delta]:
for query_range_end in [self.total_rows - delta]: #, self.total_rows]: for query_range_end in [self.total_rows - delta]: # , self.total_rows]:
expected = all_data_after_row_policy[ expected = all_data_after_row_policy[
(all_data_after_row_policy.a == 0) & (all_data_after_row_policy.a == 0)
(all_data_after_row_policy.b > query_range_start) & & (all_data_after_row_policy.b > query_range_start)
(all_data_after_row_policy.b <= query_range_end)] & (all_data_after_row_policy.b <= query_range_end)
self.query('SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} WHERE a == 0;'.format( ]
to_select, query_range_start, query_range_end), False, expected) self.query(
"SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} WHERE a == 0;".format(
to_select, query_range_start, query_range_end
),
False,
expected,
)
expected = all_data_after_row_policy[ expected = all_data_after_row_policy[
(all_data_after_row_policy.a == 0) & (all_data_after_row_policy.a == 0)
(all_data_after_row_policy.c > query_range_start) & & (all_data_after_row_policy.c > query_range_start)
(all_data_after_row_policy.c <= query_range_end)] & (all_data_after_row_policy.c <= query_range_end)
self.query('SELECT {} from tab_02473 PREWHERE c > {} AND c <= {} WHERE a == 0;'.format( ]
to_select, query_range_start, query_range_end), False, expected) self.query(
"SELECT {} from tab_02473 PREWHERE c > {} AND c <= {} WHERE a == 0;".format(
to_select, query_range_start, query_range_end
),
False,
expected,
)
expected = all_data_after_row_policy[ expected = all_data_after_row_policy[
(all_data_after_row_policy.a == 0) & (all_data_after_row_policy.a == 0)
((all_data_after_row_policy.c <= query_range_start) | & (
(all_data_after_row_policy.c > query_range_end))] (all_data_after_row_policy.c <= query_range_start)
self.query('SELECT {} from tab_02473 PREWHERE c <= {} OR c > {} WHERE a == 0;'.format( | (all_data_after_row_policy.c > query_range_end)
to_select, query_range_start, query_range_end), False, expected) )
]
self.query(
"SELECT {} from tab_02473 PREWHERE c <= {} OR c > {} WHERE a == 0;".format(
to_select, query_range_start, query_range_end
),
False,
expected,
)
def run_test(
def run_test(self, delete_range_start, delete_range_end, row_level_policy_range_start, row_level_policy_range_end): self,
delete_range_start,
delete_range_end,
row_level_policy_range_start,
row_level_policy_range_end,
):
self.repro_queries = [] self.repro_queries = []
self.query(''' self.query(
"""
CREATE TABLE tab_02473 (a Int8, b Int32, c Int32, PRIMARY KEY (a)) CREATE TABLE tab_02473 (a Int8, b Int32, c Int32, PRIMARY KEY (a))
ENGINE = MergeTree() ORDER BY (a, b) ENGINE = MergeTree() ORDER BY (a, b)
SETTINGS min_bytes_for_wide_part = 0, index_granularity = {};'''.format(self.index_granularity)) SETTINGS min_bytes_for_wide_part = 0, index_granularity = {};""".format(
self.index_granularity
)
)
self.query('INSERT INTO tab_02473 select 0, number+1, number+1 FROM numbers({});'.format(self.total_rows)) self.query(
"INSERT INTO tab_02473 select 0, number+1, number+1 FROM numbers({});".format(
self.total_rows
)
)
client = ClickHouseClient() client = ClickHouseClient()
all_data = client.query_return_df("SELECT a, b, c, 1 as d FROM tab_02473 FORMAT TabSeparatedWithNames;") all_data = client.query_return_df(
"SELECT a, b, c, 1 as d FROM tab_02473 FORMAT TabSeparatedWithNames;"
)
self.query('OPTIMIZE TABLE tab_02473 FINAL SETTINGS mutations_sync=2;') self.query("OPTIMIZE TABLE tab_02473 FINAL SETTINGS mutations_sync=2;")
# After all data has been written, add a column with a default value # After all data has been written, add a column with a default value
self.query('ALTER TABLE tab_02473 ADD COLUMN d Int64 DEFAULT 1;') self.query("ALTER TABLE tab_02473 ADD COLUMN d Int64 DEFAULT 1;")
self.check_data(all_data, -100, -100, -100, -100) self.check_data(all_data, -100, -100, -100, -100)
self.query('DELETE FROM tab_02473 WHERE a = 0 AND b > {} AND b <= {};'.format( self.query(
delete_range_start, delete_range_end)) "DELETE FROM tab_02473 WHERE a = 0 AND b > {} AND b <= {};".format(
delete_range_start, delete_range_end
)
)
self.check_data(all_data, delete_range_start, delete_range_end, -100, -100) self.check_data(all_data, delete_range_start, delete_range_end, -100, -100)
self.query('CREATE ROW POLICY policy_tab_02473 ON tab_02473 FOR SELECT USING b <= {} OR b > {} TO default;'.format( self.query(
row_level_policy_range_start, row_level_policy_range_end)) "CREATE ROW POLICY policy_tab_02473 ON tab_02473 FOR SELECT USING b <= {} OR b > {} TO default;".format(
row_level_policy_range_start, row_level_policy_range_end
)
)
self.check_data(all_data, delete_range_start, delete_range_end, row_level_policy_range_start, row_level_policy_range_end) self.check_data(
all_data,
delete_range_start,
delete_range_end,
row_level_policy_range_start,
row_level_policy_range_end,
)
self.query('DROP POLICY policy_tab_02473 ON tab_02473;') self.query("DROP POLICY policy_tab_02473 ON tab_02473;")
self.query('DROP TABLE tab_02473;')
self.query("DROP TABLE tab_02473;")
def main(): def main():
# Set mutations to synchronous mode and enable lightweight DELETEs # Set mutations to synchronous mode and enable lightweight DELETEs
url = os.environ['CLICKHOUSE_URL'] + '&max_threads=1' url = os.environ["CLICKHOUSE_URL"] + "&max_threads=1"
default_index_granularity = 10; default_index_granularity = 10
total_rows = 8 * default_index_granularity total_rows = 8 * default_index_granularity
step = default_index_granularity step = default_index_granularity
session = requests.Session() session = requests.Session()
for index_granularity in [default_index_granularity-1, default_index_granularity]: # [default_index_granularity-1, default_index_granularity+1, default_index_granularity]: for index_granularity in [
default_index_granularity - 1,
default_index_granularity,
]: # [default_index_granularity-1, default_index_granularity+1, default_index_granularity]:
tester = Tester(session, url, index_granularity, total_rows) tester = Tester(session, url, index_granularity, total_rows)
# Test combinations of ranges of various sizes masked by lightweight DELETEs # Test combinations of ranges of various sizes masked by lightweight DELETEs
# along with ranges of various sizes masked by row-level policies # along with ranges of various sizes masked by row-level policies
for delete_range_start in range(0, total_rows, 3 * step): for delete_range_start in range(0, total_rows, 3 * step):
for delete_range_end in range(delete_range_start + 3 * step, total_rows, 2 * step): for delete_range_end in range(
delete_range_start + 3 * step, total_rows, 2 * step
):
for row_level_policy_range_start in range(0, total_rows, 3 * step): for row_level_policy_range_start in range(0, total_rows, 3 * step):
for row_level_policy_range_end in range(row_level_policy_range_start + 3 * step, total_rows, 2 * step): for row_level_policy_range_end in range(
tester.run_test(delete_range_start, delete_range_end, row_level_policy_range_start, row_level_policy_range_end) row_level_policy_range_start + 3 * step, total_rows, 2 * step
):
tester.run_test(
delete_range_start,
delete_range_end,
row_level_policy_range_start,
row_level_policy_range_end,
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()
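
The nested loops above sweep many boundary combinations; as a rough illustration, a single hypothetical invocation of the Tester class defined in this file could look like the snippet below (it assumes Tester is in scope, and the parameter values are arbitrary, not taken from the commit).

# Hypothetical single run of the Tester defined above (values are illustrative only).
import os
import requests

session = requests.Session()
url = os.environ["CLICKHOUSE_URL"] + "&max_threads=1"
tester = Tester(session, url, index_granularity=10, total_rows=80)
# Delete rows with 0 < b <= 30, then hide rows with 30 < b <= 60 via a row policy.
tester.run_test(0, 30, 30, 60)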

View File

@ -4,16 +4,17 @@ import os
import sys import sys
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
class Tester: class Tester:
''' """
- Creates test table with multiple integer columns - Creates test table with multiple integer columns
- Runs read queries with multiple range conditions on different columns in PREWHERE and check that the result is correct - Runs read queries with multiple range conditions on different columns in PREWHERE and check that the result is correct
''' """
def __init__(self, session, url, index_granularity, total_rows): def __init__(self, session, url, index_granularity, total_rows):
self.session = session self.session = session
self.url = url self.url = url
@ -23,10 +24,10 @@ class Tester:
self.repro_queries = [] self.repro_queries = []
def report_error(self): def report_error(self):
print('Repro steps:', '\n\n\t'.join(self.repro_queries)) print("Repro steps:", "\n\n\t".join(self.repro_queries))
exit(1) exit(1)
def query(self, query_text, include_in_repro_steps = True, expected_data = None): def query(self, query_text, include_in_repro_steps=True, expected_data=None):
self.repro_queries.append(query_text) self.repro_queries.append(query_text)
resp = self.session.post(self.url, data=query_text) resp = self.session.post(self.url, data=query_text)
if resp.status_code != 200: if resp.status_code != 200:
@ -34,98 +35,150 @@ class Tester:
error = resp.text[0:40] error = resp.text[0:40]
if error not in self.reported_errors: if error not in self.reported_errors:
self.reported_errors.add(error) self.reported_errors.add(error)
print('Code:', resp.status_code) print("Code:", resp.status_code)
print('Result:', resp.text) print("Result:", resp.text)
self.report_error() self.report_error()
result = resp.text result = resp.text
# Check that the result is as expected # Check that the result is as expected
if ((not expected_data is None) and (int(result) != len(expected_data))): if (not expected_data is None) and (int(result) != len(expected_data)):
print('Expected {} rows, got {}'.format(len(expected_data), result)) print("Expected {} rows, got {}".format(len(expected_data), result))
print('Expected data:' + str(expected_data)) print("Expected data:" + str(expected_data))
self.report_error() self.report_error()
if not include_in_repro_steps: if not include_in_repro_steps:
self.repro_queries.pop() self.repro_queries.pop()
def check_data(
def check_data(self, all_data, c_range_start, c_range_end, d_range_start, d_range_end): self, all_data, c_range_start, c_range_end, d_range_start, d_range_end
for to_select in ['count()', 'sum(e)']: # Test reading with and without column with default value ):
self.query('SELECT {} FROM tab_02473;'.format(to_select), False, all_data) for to_select in [
"count()",
"sum(e)",
]: # Test reading with and without column with default value
self.query("SELECT {} FROM tab_02473;".format(to_select), False, all_data)
delta = 10 delta = 10
for b_range_start in [0, delta]: for b_range_start in [0, delta]:
for b_range_end in [self.total_rows - delta]: #, self.total_rows]: for b_range_end in [self.total_rows - delta]: # , self.total_rows]:
expected = all_data[ expected = all_data[
(all_data.a == 0) & (all_data.a == 0)
(all_data.b > b_range_start) & & (all_data.b > b_range_start)
(all_data.b <= b_range_end)] & (all_data.b <= b_range_end)
self.query('SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} WHERE a == 0;'.format( ]
to_select, b_range_start, b_range_end), False, expected) self.query(
"SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} WHERE a == 0;".format(
to_select, b_range_start, b_range_end
),
False,
expected,
)
expected = all_data[ expected = all_data[
(all_data.a == 0) & (all_data.a == 0)
(all_data.b > b_range_start) & & (all_data.b > b_range_start)
(all_data.b <= b_range_end) & & (all_data.b <= b_range_end)
(all_data.c > c_range_start) & & (all_data.c > c_range_start)
(all_data.c <= c_range_end)] & (all_data.c <= c_range_end)
self.query('SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} AND c > {} AND c <= {} WHERE a == 0;'.format( ]
to_select, b_range_start, b_range_end, c_range_start, c_range_end), False, expected) self.query(
"SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} AND c > {} AND c <= {} WHERE a == 0;".format(
to_select,
b_range_start,
b_range_end,
c_range_start,
c_range_end,
),
False,
expected,
)
expected = all_data[ expected = all_data[
(all_data.a == 0) & (all_data.a == 0)
(all_data.b > b_range_start) & & (all_data.b > b_range_start)
(all_data.b <= b_range_end) & & (all_data.b <= b_range_end)
(all_data.c > c_range_start) & & (all_data.c > c_range_start)
(all_data.c <= c_range_end) & & (all_data.c <= c_range_end)
(all_data.d > d_range_start) & & (all_data.d > d_range_start)
(all_data.d <= d_range_end)] & (all_data.d <= d_range_end)
self.query('SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} AND c > {} AND c <= {} AND d > {} AND d <= {} WHERE a == 0;'.format( ]
to_select, b_range_start, b_range_end, c_range_start, c_range_end, d_range_start, d_range_end), False, expected) self.query(
"SELECT {} from tab_02473 PREWHERE b > {} AND b <= {} AND c > {} AND c <= {} AND d > {} AND d <= {} WHERE a == 0;".format(
to_select,
b_range_start,
b_range_end,
c_range_start,
c_range_end,
d_range_start,
d_range_end,
),
False,
expected,
)
def run_test(self, c_range_start, c_range_end, d_range_start, d_range_end): def run_test(self, c_range_start, c_range_end, d_range_start, d_range_end):
self.repro_queries = [] self.repro_queries = []
self.query(''' self.query(
"""
CREATE TABLE tab_02473 (a Int8, b Int32, c Int32, d Int32, PRIMARY KEY (a)) CREATE TABLE tab_02473 (a Int8, b Int32, c Int32, d Int32, PRIMARY KEY (a))
ENGINE = MergeTree() ORDER BY (a, b) ENGINE = MergeTree() ORDER BY (a, b)
SETTINGS min_bytes_for_wide_part = 0, index_granularity = {};'''.format(self.index_granularity)) SETTINGS min_bytes_for_wide_part = 0, index_granularity = {};""".format(
self.index_granularity
)
)
self.query('INSERT INTO tab_02473 select 0, number+1, number+1, number+1 FROM numbers({});'.format(self.total_rows)) self.query(
"INSERT INTO tab_02473 select 0, number+1, number+1, number+1 FROM numbers({});".format(
self.total_rows
)
)
client = ClickHouseClient() client = ClickHouseClient()
all_data = client.query_return_df("SELECT a, b, c, d, 1 as e FROM tab_02473 FORMAT TabSeparatedWithNames;") all_data = client.query_return_df(
"SELECT a, b, c, d, 1 as e FROM tab_02473 FORMAT TabSeparatedWithNames;"
)
self.query('OPTIMIZE TABLE tab_02473 FINAL SETTINGS mutations_sync=2;') self.query("OPTIMIZE TABLE tab_02473 FINAL SETTINGS mutations_sync=2;")
# After all data has been written, add a column with a default value # After all data has been written, add a column with a default value
self.query('ALTER TABLE tab_02473 ADD COLUMN e Int64 DEFAULT 1;') self.query("ALTER TABLE tab_02473 ADD COLUMN e Int64 DEFAULT 1;")
self.check_data(all_data, c_range_start, c_range_end, d_range_start, d_range_end) self.check_data(
all_data, c_range_start, c_range_end, d_range_start, d_range_end
self.query('DROP TABLE tab_02473;') )
self.query("DROP TABLE tab_02473;")
def main(): def main():
# Enable multiple prewhere read steps # Enable multiple prewhere read steps
url = os.environ['CLICKHOUSE_URL'] + '&enable_multiple_prewhere_read_steps=1&move_all_conditions_to_prewhere=0&max_threads=1' url = (
os.environ["CLICKHOUSE_URL"]
+ "&enable_multiple_prewhere_read_steps=1&move_all_conditions_to_prewhere=0&max_threads=1"
)
default_index_granularity = 10; default_index_granularity = 10
total_rows = 8 * default_index_granularity total_rows = 8 * default_index_granularity
step = default_index_granularity step = default_index_granularity
session = requests.Session() session = requests.Session()
for index_granularity in [default_index_granularity-1, default_index_granularity]: for index_granularity in [default_index_granularity - 1, default_index_granularity]:
tester = Tester(session, url, index_granularity, total_rows) tester = Tester(session, url, index_granularity, total_rows)
# Test combinations of ranges of columns c and d # Test combinations of ranges of columns c and d
for c_range_start in range(0, total_rows, int(2.3 * step)): for c_range_start in range(0, total_rows, int(2.3 * step)):
for c_range_end in range(c_range_start + 3 * step, total_rows, int(2.1 * step)): for c_range_end in range(
for d_range_start in range(int(0.5 * step), total_rows, int(2.7 * step)): c_range_start + 3 * step, total_rows, int(2.1 * step)
for d_range_end in range(d_range_start + 3 * step, total_rows, int(2.2 * step)): ):
tester.run_test(c_range_start, c_range_end, d_range_start, d_range_end) for d_range_start in range(
int(0.5 * step), total_rows, int(2.7 * step)
):
for d_range_end in range(
d_range_start + 3 * step, total_rows, int(2.2 * step)
):
tester.run_test(
c_range_start, c_range_end, d_range_start, d_range_end
)
if __name__ == "__main__": if __name__ == "__main__":
main() main()

View File

@ -8,7 +8,7 @@ import time
from threading import Thread from threading import Thread
CURDIR = os.path.dirname(os.path.realpath(__file__)) CURDIR = os.path.dirname(os.path.realpath(__file__))
sys.path.insert(0, os.path.join(CURDIR, 'helpers')) sys.path.insert(0, os.path.join(CURDIR, "helpers"))
from pure_http_client import ClickHouseClient from pure_http_client import ClickHouseClient
@ -16,14 +16,23 @@ client = ClickHouseClient()
# test table without partition # test table without partition
client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part NO DELAY") client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part NO DELAY")
client.query(''' client.query(
"""
CREATE TABLE t_async_insert_dedup_no_part ( CREATE TABLE t_async_insert_dedup_no_part (
KeyID UInt32 KeyID UInt32
) Engine = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/t_async_insert_dedup', '{replica}') ) Engine = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/t_async_insert_dedup', '{replica}')
ORDER BY (KeyID) ORDER BY (KeyID)
''') """
)
client.query("insert into t_async_insert_dedup_no_part values (1), (2), (3), (4), (5)", settings = {"async_insert": 1, "wait_for_async_insert": 1, "insert_keeper_fault_injection_probability": 0}) client.query(
"insert into t_async_insert_dedup_no_part values (1), (2), (3), (4), (5)",
settings={
"async_insert": 1,
"wait_for_async_insert": 1,
"insert_keeper_fault_injection_probability": 0,
},
)
result = client.query("select count(*) from t_async_insert_dedup_no_part") result = client.query("select count(*) from t_async_insert_dedup_no_part")
print(result, flush=True) print(result, flush=True)
client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part NO DELAY") client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part NO DELAY")
@ -32,13 +41,13 @@ client.query("DROP TABLE IF EXISTS t_async_insert_dedup_no_part NO DELAY")
def generate_data(q, total_number): def generate_data(q, total_number):
old_data = [] old_data = []
max_chunk_size = 30 max_chunk_size = 30
partitions = ['2022-11-11 10:10:10', '2022-12-12 10:10:10'] partitions = ["2022-11-11 10:10:10", "2022-12-12 10:10:10"]
last_number = 0 last_number = 0
while True: while True:
dup_simulate = random.randint(0,3) dup_simulate = random.randint(0, 3)
# insert old data randomly; 25% of inserts are duplicates. # insert old data randomly; 25% of inserts are duplicates.
if dup_simulate == 0: if dup_simulate == 0:
last_idx = len(old_data)-1 last_idx = len(old_data) - 1
if last_idx < 0: if last_idx < 0:
continue continue
idx = last_idx - random.randint(0, 50) idx = last_idx - random.randint(0, 50)
@ -53,7 +62,7 @@ def generate_data(q, total_number):
end = start + chunk_size end = start + chunk_size
if end > total_number: if end > total_number:
end = total_number end = total_number
for i in range(start, end+1): for i in range(start, end + 1):
partition = partitions[random.randint(0, 1)] partition = partitions[random.randint(0, 1)]
insert_stmt += "('{}', {}),".format(partition, i) insert_stmt += "('{}', {}),".format(partition, i)
insert_stmt = insert_stmt[:-1] insert_stmt = insert_stmt[:-1]
@ -65,33 +74,46 @@ def generate_data(q, total_number):
# wait until all the tasks are done. # wait until all the tasks are done.
q.join() q.join()
def fetch_and_insert_data(q, client): def fetch_and_insert_data(q, client):
while True: while True:
insert = q.get() insert = q.get()
client.query(insert, settings = {"async_insert": 1, "async_insert_deduplicate": 1, "wait_for_async_insert": 0, "async_insert_busy_timeout_ms": 1500, "insert_keeper_fault_injection_probability": 0}) client.query(
insert,
settings={
"async_insert": 1,
"async_insert_deduplicate": 1,
"wait_for_async_insert": 0,
"async_insert_busy_timeout_ms": 1500,
"insert_keeper_fault_injection_probability": 0,
},
)
q.task_done() q.task_done()
sleep_time = random.randint(50, 500) sleep_time = random.randint(50, 500)
time.sleep(sleep_time/1000.0) time.sleep(sleep_time / 1000.0)
# main process # main process
client.query("DROP TABLE IF EXISTS t_async_insert_dedup NO DELAY") client.query("DROP TABLE IF EXISTS t_async_insert_dedup NO DELAY")
client.query(''' client.query(
"""
CREATE TABLE t_async_insert_dedup ( CREATE TABLE t_async_insert_dedup (
EventDate DateTime, EventDate DateTime,
KeyID UInt32 KeyID UInt32
) Engine = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/t_async_insert_dedup', '{replica}') ) Engine = ReplicatedMergeTree('/clickhouse/tables/{shard}/{database}/t_async_insert_dedup', '{replica}')
PARTITION BY toYYYYMM(EventDate) PARTITION BY toYYYYMM(EventDate)
ORDER BY (KeyID, EventDate) SETTINGS use_async_block_ids_cache = 1 ORDER BY (KeyID, EventDate) SETTINGS use_async_block_ids_cache = 1
''') """
)
q = queue.Queue(100) q = queue.Queue(100)
total_number = 10000 total_number = 10000
gen = Thread(target = generate_data, args = [q, total_number]) gen = Thread(target=generate_data, args=[q, total_number])
gen.start() gen.start()
for i in range(3): for i in range(3):
insert = Thread(target = fetch_and_insert_data, args = [q, client]) insert = Thread(target=fetch_and_insert_data, args=[q, client])
insert.start() insert.start()
gen.join() gen.join()
@ -109,7 +131,7 @@ while True:
errMsg = f"the size of result is {len(result)}. we expect {total_number}." errMsg = f"the size of result is {len(result)}. we expect {total_number}."
else: else:
for i in range(total_number): for i in range(total_number):
expect = str(i+1) expect = str(i + 1)
real = result[i] real = result[i]
if expect != real: if expect != real:
err = True err = True
@ -117,7 +139,7 @@ while True:
break break
# retry several times to get stable results. # retry several times to get stable results.
if err and retry >= 5: if err and retry >= 5:
print (errMsg, flush=True) print(errMsg, flush=True)
elif err: elif err:
retry += 1 retry += 1
continue continue
@ -125,11 +147,15 @@ while True:
print(len(result), flush=True) print(len(result), flush=True)
break break
result = client.query("SELECT value FROM system.metrics where metric = 'AsyncInsertCacheSize'") result = client.query(
"SELECT value FROM system.metrics where metric = 'AsyncInsertCacheSize'"
)
result = int(result.split()[0]) result = int(result.split()[0])
if result <= 0: if result <= 0:
raise Exception(f"AsyncInsertCacheSize should be > 0, but got {result}") raise Exception(f"AsyncInsertCacheSize should be > 0, but got {result}")
result = client.query("SELECT value FROM system.events where event = 'AsyncInsertCacheHits'") result = client.query(
"SELECT value FROM system.events where event = 'AsyncInsertCacheHits'"
)
result = int(result.split()[0]) result = int(result.split()[0])
if result <= 0: if result <= 0:
raise Exception(f"AsyncInsertCacheHits should be > 0, but got {result}") raise Exception(f"AsyncInsertCacheHits should be > 0, but got {result}")
@ -19,9 +19,9 @@ import tenacity
import xmltodict import xmltodict
import yaml import yaml
SELECT_VERSION = r'SELECT version()' SELECT_VERSION = r"SELECT version()"
SELECT_UPTIME = r''' SELECT_UPTIME = r"""
{% if version_ge('21.3') -%} {% if version_ge('21.3') -%}
SELECT formatReadableTimeDelta(uptime()) SELECT formatReadableTimeDelta(uptime())
{% else -%} {% else -%}
@ -29,18 +29,18 @@ SELECT
toString(floor(uptime() / 3600 / 24)) || ' days ' || toString(floor(uptime() / 3600 / 24)) || ' days ' ||
toString(floor(uptime() % (24 * 3600) / 3600, 1)) || ' hours' toString(floor(uptime() % (24 * 3600) / 3600, 1)) || ' hours'
{% endif -%} {% endif -%}
''' """
SELECT_SYSTEM_TABLES = "SELECT name FROM system.tables WHERE database = 'system'" SELECT_SYSTEM_TABLES = "SELECT name FROM system.tables WHERE database = 'system'"
SELECT_DATABASE_ENGINES = r'''SELECT SELECT_DATABASE_ENGINES = r"""SELECT
engine, engine,
count() "count" count() "count"
FROM system.databases FROM system.databases
GROUP BY engine GROUP BY engine
''' """
SELECT_DATABASES = r'''SELECT SELECT_DATABASES = r"""SELECT
name, name,
engine, engine,
tables, tables,
@ -62,17 +62,17 @@ LEFT JOIN
) AS db_stats ON db.name = db_stats.database ) AS db_stats ON db.name = db_stats.database
ORDER BY bytes_on_disk DESC ORDER BY bytes_on_disk DESC
LIMIT 10 LIMIT 10
''' """
SELECT_TABLE_ENGINES = r'''SELECT SELECT_TABLE_ENGINES = r"""SELECT
engine, engine,
count() "count" count() "count"
FROM system.tables FROM system.tables
WHERE database != 'system' WHERE database != 'system'
GROUP BY engine GROUP BY engine
''' """
SELECT_DICTIONARIES = r'''SELECT SELECT_DICTIONARIES = r"""SELECT
source, source,
type, type,
status, status,
@ -80,13 +80,13 @@ SELECT_DICTIONARIES = r'''SELECT
FROM system.dictionaries FROM system.dictionaries
GROUP BY source, type, status GROUP BY source, type, status
ORDER BY status DESC, source ORDER BY status DESC, source
''' """
SELECT_ACCESS = "SHOW ACCESS" SELECT_ACCESS = "SHOW ACCESS"
SELECT_QUOTA_USAGE = "SHOW QUOTA" SELECT_QUOTA_USAGE = "SHOW QUOTA"
SELECT_REPLICAS = r'''SELECT SELECT_REPLICAS = r"""SELECT
database, database,
table, table,
is_leader, is_leader,
@ -98,9 +98,9 @@ SELECT_REPLICAS = r'''SELECT
FROM system.replicas FROM system.replicas
ORDER BY absolute_delay DESC ORDER BY absolute_delay DESC
LIMIT 10 LIMIT 10
''' """
SELECT_REPLICATION_QUEUE = r'''SELECT SELECT_REPLICATION_QUEUE = r"""SELECT
database, database,
table, table,
replica_name, replica_name,
@ -121,9 +121,9 @@ SELECT_REPLICATION_QUEUE = r'''SELECT
FROM system.replication_queue FROM system.replication_queue
ORDER BY create_time ASC ORDER BY create_time ASC
LIMIT 20 LIMIT 20
''' """
SELECT_REPLICATED_FETCHES = r'''SELECT SELECT_REPLICATED_FETCHES = r"""SELECT
database, database,
table, table,
round(elapsed, 1) "elapsed", round(elapsed, 1) "elapsed",
@ -140,9 +140,9 @@ SELECT_REPLICATED_FETCHES = r'''SELECT
to_detached, to_detached,
thread_id thread_id
FROM system.replicated_fetches FROM system.replicated_fetches
''' """
SELECT_PARTS_PER_TABLE = r'''SELECT SELECT_PARTS_PER_TABLE = r"""SELECT
database, database,
table, table,
count() "partitions", count() "partitions",
@ -162,9 +162,9 @@ FROM
GROUP BY database, table GROUP BY database, table
ORDER BY max_parts_per_partition DESC ORDER BY max_parts_per_partition DESC
LIMIT 10 LIMIT 10
''' """
SELECT_MERGES = r'''SELECT SELECT_MERGES = r"""SELECT
database, database,
table, table,
round(elapsed, 1) "elapsed", round(elapsed, 1) "elapsed",
@ -187,9 +187,9 @@ SELECT_MERGES = r'''SELECT
formatReadableSize(memory_usage) "memory_usage" formatReadableSize(memory_usage) "memory_usage"
{% endif -%} {% endif -%}
FROM system.merges FROM system.merges
''' """
SELECT_MUTATIONS = r'''SELECT SELECT_MUTATIONS = r"""SELECT
database, database,
table, table,
mutation_id, mutation_id,
@ -206,9 +206,9 @@ SELECT_MUTATIONS = r'''SELECT
FROM system.mutations FROM system.mutations
WHERE NOT is_done WHERE NOT is_done
ORDER BY create_time DESC ORDER BY create_time DESC
''' """
SELECT_RECENT_DATA_PARTS = r'''SELECT SELECT_RECENT_DATA_PARTS = r"""SELECT
database, database,
table, table,
engine, engine,
@ -242,9 +242,9 @@ SELECT_RECENT_DATA_PARTS = r'''SELECT
FROM system.parts FROM system.parts
WHERE modification_time > now() - INTERVAL 3 MINUTE WHERE modification_time > now() - INTERVAL 3 MINUTE
ORDER BY modification_time DESC ORDER BY modification_time DESC
''' """
SELECT_DETACHED_DATA_PARTS = r'''SELECT SELECT_DETACHED_DATA_PARTS = r"""SELECT
database, database,
table, table,
partition_id, partition_id,
@ -255,9 +255,9 @@ SELECT_DETACHED_DATA_PARTS = r'''SELECT
max_block_number, max_block_number,
level level
FROM system.detached_parts FROM system.detached_parts
''' """
SELECT_PROCESSES = r'''SELECT SELECT_PROCESSES = r"""SELECT
elapsed, elapsed,
query_id, query_id,
{% if normalize_queries -%} {% if normalize_queries -%}
@ -285,9 +285,9 @@ SELECT_PROCESSES = r'''SELECT
{% endif -%} {% endif -%}
FROM system.processes FROM system.processes
ORDER BY elapsed DESC ORDER BY elapsed DESC
''' """
SELECT_TOP_QUERIES_BY_DURATION = r'''SELECT SELECT_TOP_QUERIES_BY_DURATION = r"""SELECT
type, type,
query_start_time, query_start_time,
query_duration_ms, query_duration_ms,
@ -339,9 +339,9 @@ WHERE type != 'QueryStart'
AND event_time >= now() - INTERVAL 1 DAY AND event_time >= now() - INTERVAL 1 DAY
ORDER BY query_duration_ms DESC ORDER BY query_duration_ms DESC
LIMIT 10 LIMIT 10
''' """
SELECT_TOP_QUERIES_BY_MEMORY_USAGE = r'''SELECT SELECT_TOP_QUERIES_BY_MEMORY_USAGE = r"""SELECT
type, type,
query_start_time, query_start_time,
query_duration_ms, query_duration_ms,
@ -393,9 +393,9 @@ WHERE type != 'QueryStart'
AND event_time >= now() - INTERVAL 1 DAY AND event_time >= now() - INTERVAL 1 DAY
ORDER BY memory_usage DESC ORDER BY memory_usage DESC
LIMIT 10 LIMIT 10
''' """
SELECT_FAILED_QUERIES = r'''SELECT SELECT_FAILED_QUERIES = r"""SELECT
type, type,
query_start_time, query_start_time,
query_duration_ms, query_duration_ms,
@ -448,9 +448,9 @@ WHERE type != 'QueryStart'
AND exception != '' AND exception != ''
ORDER BY query_start_time DESC ORDER BY query_start_time DESC
LIMIT 10 LIMIT 10
''' """
SELECT_STACK_TRACES = r'''SELECT SELECT_STACK_TRACES = r"""SELECT
'\n' || arrayStringConcat( '\n' || arrayStringConcat(
arrayMap( arrayMap(
x, x,
@ -459,9 +459,9 @@ SELECT_STACK_TRACES = r'''SELECT
arrayMap(x -> demangle(addressToSymbol(x)), trace)), arrayMap(x -> demangle(addressToSymbol(x)), trace)),
'\n') AS trace '\n') AS trace
FROM system.stack_trace FROM system.stack_trace
''' """
SELECT_CRASH_LOG = r'''SELECT SELECT_CRASH_LOG = r"""SELECT
event_time, event_time,
signal, signal,
thread_id, thread_id,
@ -470,7 +470,7 @@ SELECT_CRASH_LOG = r'''SELECT
version version
FROM system.crash_log FROM system.crash_log
ORDER BY event_time DESC ORDER BY event_time DESC
''' """
def retry(exception_types, max_attempts=5, max_interval=5): def retry(exception_types, max_attempts=5, max_interval=5):
@ -481,7 +481,8 @@ def retry(exception_types, max_attempts=5, max_interval=5):
retry=tenacity.retry_if_exception_type(exception_types), retry=tenacity.retry_if_exception_type(exception_types),
wait=tenacity.wait_random_exponential(multiplier=0.5, max=max_interval), wait=tenacity.wait_random_exponential(multiplier=0.5, max=max_interval),
stop=tenacity.stop_after_attempt(max_attempts), stop=tenacity.stop_after_attempt(max_attempts),
reraise=True) reraise=True,
)
class ClickhouseError(Exception): class ClickhouseError(Exception):
@ -502,9 +503,9 @@ class ClickhouseClient:
def __init__(self, *, host="localhost", port=8123, user="default", password): def __init__(self, *, host="localhost", port=8123, user="default", password):
self._session = requests.Session() self._session = requests.Session()
if user: if user:
self._session.headers['X-ClickHouse-User'] = user self._session.headers["X-ClickHouse-User"] = user
self._session.headers['X-ClickHouse-Key'] = password self._session.headers["X-ClickHouse-Key"] = password
self._url = f'http://{host}:{port}' self._url = f"http://{host}:{port}"
self._timeout = 60 self._timeout = 60
self._ch_version = None self._ch_version = None
@ -516,7 +517,16 @@ class ClickhouseClient:
return self._ch_version return self._ch_version
@retry(requests.exceptions.ConnectionError) @retry(requests.exceptions.ConnectionError)
def query(self, query, query_args=None, format=None, post_data=None, timeout=None, echo=False, dry_run=False): def query(
self,
query,
query_args=None,
format=None,
post_data=None,
timeout=None,
echo=False,
dry_run=False,
):
""" """
Execute query. Execute query.
""" """
@ -524,28 +534,30 @@ class ClickhouseClient:
query = self.render_query(query, **query_args) query = self.render_query(query, **query_args)
if format: if format:
query += f' FORMAT {format}' query += f" FORMAT {format}"
if timeout is None: if timeout is None:
timeout = self._timeout timeout = self._timeout
if echo: if echo:
print(sqlparse.format(query, reindent=True), '\n') print(sqlparse.format(query, reindent=True), "\n")
if dry_run: if dry_run:
return None return None
try: try:
response = self._session.post(self._url, response = self._session.post(
params={ self._url,
'query': query, params={
}, "query": query,
json=post_data, },
timeout=timeout) json=post_data,
timeout=timeout,
)
response.raise_for_status() response.raise_for_status()
if format in ('JSON', 'JSONCompact'): if format in ("JSON", "JSONCompact"):
return response.json() return response.json()
return response.text.strip() return response.text.strip()
@ -555,7 +567,9 @@ class ClickhouseClient:
def render_query(self, query, **kwargs): def render_query(self, query, **kwargs):
env = jinja2.Environment() env = jinja2.Environment()
env.globals['version_ge'] = lambda version: version_ge(self.clickhouse_version, version) env.globals["version_ge"] = lambda version: version_ge(
self.clickhouse_version, version
)
template = env.from_string(query) template = env.from_string(query)
return template.render(kwargs) return template.render(kwargs)
@ -578,11 +592,13 @@ class ClickhouseConfig:
@classmethod @classmethod
def load(cls): def load(cls):
return ClickhouseConfig(cls._load_config('/var/lib/clickhouse/preprocessed_configs/config.xml')) return ClickhouseConfig(
cls._load_config("/var/lib/clickhouse/preprocessed_configs/config.xml")
)
@staticmethod @staticmethod
def _load_config(config_path): def _load_config(config_path):
with open(config_path, 'r') as file: with open(config_path, "r") as file:
return xmltodict.parse(file.read()) return xmltodict.parse(file.read())
@classmethod @classmethod
@ -591,8 +607,8 @@ class ClickhouseConfig:
for key, value in list(config.items()): for key, value in list(config.items()):
if isinstance(value, MutableMapping): if isinstance(value, MutableMapping):
cls._mask_secrets(config[key]) cls._mask_secrets(config[key])
elif key in ('password', 'secret_access_key', 'header', 'identity'): elif key in ("password", "secret_access_key", "header", "identity"):
config[key] = '*****' config[key] = "*****"
class DiagnosticsData: class DiagnosticsData:
@ -603,53 +619,53 @@ class DiagnosticsData:
def __init__(self, args): def __init__(self, args):
self.args = args self.args = args
self.host = args.host self.host = args.host
self._sections = [{'section': None, 'data': {}}] self._sections = [{"section": None, "data": {}}]
def add_string(self, name, value, section=None): def add_string(self, name, value, section=None):
self._section(section)[name] = { self._section(section)[name] = {
'type': 'string', "type": "string",
'value': value, "value": value,
} }
def add_xml_document(self, name, document, section=None): def add_xml_document(self, name, document, section=None):
self._section(section)[name] = { self._section(section)[name] = {
'type': 'xml', "type": "xml",
'value': document, "value": document,
} }
def add_query(self, name, query, result, section=None): def add_query(self, name, query, result, section=None):
self._section(section)[name] = { self._section(section)[name] = {
'type': 'query', "type": "query",
'query': query, "query": query,
'result': result, "result": result,
} }
def add_command(self, name, command, result, section=None): def add_command(self, name, command, result, section=None):
self._section(section)[name] = { self._section(section)[name] = {
'type': 'command', "type": "command",
'command': command, "command": command,
'result': result, "result": result,
} }
def dump(self, format): def dump(self, format):
if format.startswith('json'): if format.startswith("json"):
result = self._dump_json() result = self._dump_json()
elif format.startswith('yaml'): elif format.startswith("yaml"):
result = self._dump_yaml() result = self._dump_yaml()
else: else:
result = self._dump_wiki() result = self._dump_wiki()
if format.endswith('.gz'): if format.endswith(".gz"):
compressor = gzip.GzipFile(mode='wb', fileobj=sys.stdout.buffer) compressor = gzip.GzipFile(mode="wb", fileobj=sys.stdout.buffer)
compressor.write(result.encode()) compressor.write(result.encode())
else: else:
print(result) print(result)
def _section(self, name=None): def _section(self, name=None):
if self._sections[-1]['section'] != name: if self._sections[-1]["section"] != name:
self._sections.append({'section': name, 'data': {}}) self._sections.append({"section": name, "data": {}})
return self._sections[-1]['data'] return self._sections[-1]["data"]
def _dump_json(self): def _dump_json(self):
""" """
@ -669,85 +685,85 @@ class DiagnosticsData:
""" """
def _write_title(buffer, value): def _write_title(buffer, value):
buffer.write(f'### {value}\n') buffer.write(f"### {value}\n")
def _write_subtitle(buffer, value): def _write_subtitle(buffer, value):
buffer.write(f'#### {value}\n') buffer.write(f"#### {value}\n")
def _write_string_item(buffer, name, item): def _write_string_item(buffer, name, item):
value = item['value'] value = item["value"]
if value != '': if value != "":
value = f'**{value}**' value = f"**{value}**"
buffer.write(f'{name}: {value}\n') buffer.write(f"{name}: {value}\n")
def _write_xml_item(buffer, section_name, name, item): def _write_xml_item(buffer, section_name, name, item):
if section_name: if section_name:
buffer.write(f'##### {name}\n') buffer.write(f"##### {name}\n")
else: else:
_write_subtitle(buffer, name) _write_subtitle(buffer, name)
_write_result(buffer, item['value'], format='XML') _write_result(buffer, item["value"], format="XML")
def _write_query_item(buffer, section_name, name, item): def _write_query_item(buffer, section_name, name, item):
if section_name: if section_name:
buffer.write(f'##### {name}\n') buffer.write(f"##### {name}\n")
else: else:
_write_subtitle(buffer, name) _write_subtitle(buffer, name)
_write_query(buffer, item['query']) _write_query(buffer, item["query"])
_write_result(buffer, item['result']) _write_result(buffer, item["result"])
def _write_command_item(buffer, section_name, name, item): def _write_command_item(buffer, section_name, name, item):
if section_name: if section_name:
buffer.write(f'##### {name}\n') buffer.write(f"##### {name}\n")
else: else:
_write_subtitle(buffer, name) _write_subtitle(buffer, name)
_write_command(buffer, item['command']) _write_command(buffer, item["command"])
_write_result(buffer, item['result']) _write_result(buffer, item["result"])
def _write_unknown_item(buffer, section_name, name, item): def _write_unknown_item(buffer, section_name, name, item):
if section_name: if section_name:
buffer.write(f'**{name}**\n') buffer.write(f"**{name}**\n")
else: else:
_write_subtitle(buffer, name) _write_subtitle(buffer, name)
json.dump(item, buffer, indent=2) json.dump(item, buffer, indent=2)
def _write_query(buffer, query): def _write_query(buffer, query):
buffer.write('**query**\n') buffer.write("**query**\n")
buffer.write('```sql\n') buffer.write("```sql\n")
buffer.write(query) buffer.write(query)
buffer.write('\n```\n') buffer.write("\n```\n")
def _write_command(buffer, command): def _write_command(buffer, command):
buffer.write('**command**\n') buffer.write("**command**\n")
buffer.write('```\n') buffer.write("```\n")
buffer.write(command) buffer.write(command)
buffer.write('\n```\n') buffer.write("\n```\n")
def _write_result(buffer, result, format=None): def _write_result(buffer, result, format=None):
buffer.write('**result**\n') buffer.write("**result**\n")
buffer.write(f'```{format}\n' if format else '```\n') buffer.write(f"```{format}\n" if format else "```\n")
buffer.write(result) buffer.write(result)
buffer.write('\n```\n') buffer.write("\n```\n")
buffer = io.StringIO() buffer = io.StringIO()
_write_title(buffer, f'Diagnostics data for host {self.host}') _write_title(buffer, f"Diagnostics data for host {self.host}")
for section in self._sections: for section in self._sections:
section_name = section['section'] section_name = section["section"]
if section_name: if section_name:
_write_subtitle(buffer, section_name) _write_subtitle(buffer, section_name)
for name, item in section['data'].items(): for name, item in section["data"].items():
if item['type'] == 'string': if item["type"] == "string":
_write_string_item(buffer, name, item) _write_string_item(buffer, name, item)
elif item['type'] == 'query': elif item["type"] == "query":
_write_query_item(buffer, section_name, name, item) _write_query_item(buffer, section_name, name, item)
elif item['type'] == 'command': elif item["type"] == "command":
_write_command_item(buffer, section_name, name, item) _write_command_item(buffer, section_name, name, item)
elif item['type'] == 'xml': elif item["type"] == "xml":
_write_xml_item(buffer, section_name, name, item) _write_xml_item(buffer, section_name, name, item)
else: else:
_write_unknown_item(buffer, section_name, name, item) _write_unknown_item(buffer, section_name, name, item)
@ -760,126 +776,196 @@ def main():
Program entry point. Program entry point.
""" """
args = parse_args() args = parse_args()
timestamp = datetime.strftime(datetime.now(), '%Y-%m-%d %H:%M:%S') timestamp = datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")
client = ClickhouseClient(host=args.host, port=args.port, user=args.user, password=args.password) client = ClickhouseClient(
host=args.host, port=args.port, user=args.user, password=args.password
)
ch_config = ClickhouseConfig.load() ch_config = ClickhouseConfig.load()
version = client.clickhouse_version version = client.clickhouse_version
system_tables = [row[0] for row in execute_query(client, SELECT_SYSTEM_TABLES, format='JSONCompact')['data']] system_tables = [
row[0]
for row in execute_query(client, SELECT_SYSTEM_TABLES, format="JSONCompact")[
"data"
]
]
diagnostics = DiagnosticsData(args) diagnostics = DiagnosticsData(args)
diagnostics.add_string('Version', version) diagnostics.add_string("Version", version)
diagnostics.add_string('Timestamp', timestamp) diagnostics.add_string("Timestamp", timestamp)
diagnostics.add_string('Uptime', execute_query(client, SELECT_UPTIME)) diagnostics.add_string("Uptime", execute_query(client, SELECT_UPTIME))
diagnostics.add_xml_document('ClickHouse configuration', ch_config.dump()) diagnostics.add_xml_document("ClickHouse configuration", ch_config.dump())
if version_ge(version, '20.8'): if version_ge(version, "20.8"):
add_query(diagnostics, 'Access configuration', add_query(
client=client, diagnostics,
query=SELECT_ACCESS, "Access configuration",
format='TSVRaw') client=client,
add_query(diagnostics, 'Quotas', query=SELECT_ACCESS,
client=client, format="TSVRaw",
query=SELECT_QUOTA_USAGE, )
format='Vertical') add_query(
diagnostics,
"Quotas",
client=client,
query=SELECT_QUOTA_USAGE,
format="Vertical",
)
add_query(diagnostics, 'Database engines', add_query(
client=client, diagnostics,
query=SELECT_DATABASE_ENGINES, "Database engines",
format='PrettyCompactNoEscapes', client=client,
section='Schema') query=SELECT_DATABASE_ENGINES,
add_query(diagnostics, 'Databases (top 10 by size)', format="PrettyCompactNoEscapes",
client=client, section="Schema",
query=SELECT_DATABASES, )
format='PrettyCompactNoEscapes', add_query(
section='Schema') diagnostics,
add_query(diagnostics, 'Table engines', "Databases (top 10 by size)",
client=client, client=client,
query=SELECT_TABLE_ENGINES, query=SELECT_DATABASES,
format='PrettyCompactNoEscapes', format="PrettyCompactNoEscapes",
section='Schema') section="Schema",
add_query(diagnostics, 'Dictionaries', )
client=client, add_query(
query=SELECT_DICTIONARIES, diagnostics,
format='PrettyCompactNoEscapes', "Table engines",
section='Schema') client=client,
query=SELECT_TABLE_ENGINES,
format="PrettyCompactNoEscapes",
section="Schema",
)
add_query(
diagnostics,
"Dictionaries",
client=client,
query=SELECT_DICTIONARIES,
format="PrettyCompactNoEscapes",
section="Schema",
)
add_query(diagnostics, 'Replicated tables (top 10 by absolute delay)', add_query(
client=client, diagnostics,
query=SELECT_REPLICAS, "Replicated tables (top 10 by absolute delay)",
format='PrettyCompactNoEscapes', client=client,
section='Replication') query=SELECT_REPLICAS,
add_query(diagnostics, 'Replication queue (top 20 oldest tasks)', format="PrettyCompactNoEscapes",
client=client, section="Replication",
query=SELECT_REPLICATION_QUEUE, )
format='Vertical', add_query(
section='Replication') diagnostics,
if version_ge(version, '21.3'): "Replication queue (top 20 oldest tasks)",
add_query(diagnostics, 'Replicated fetches', client=client,
client=client, query=SELECT_REPLICATION_QUEUE,
query=SELECT_REPLICATED_FETCHES, format="Vertical",
format='Vertical', section="Replication",
section='Replication') )
if version_ge(version, "21.3"):
add_query(
diagnostics,
"Replicated fetches",
client=client,
query=SELECT_REPLICATED_FETCHES,
format="Vertical",
section="Replication",
)
add_query(diagnostics, 'Top 10 tables by max parts per partition', add_query(
client=client, diagnostics,
query=SELECT_PARTS_PER_TABLE, "Top 10 tables by max parts per partition",
format='PrettyCompactNoEscapes') client=client,
add_query(diagnostics, 'Merges in progress', query=SELECT_PARTS_PER_TABLE,
client=client, format="PrettyCompactNoEscapes",
query=SELECT_MERGES, )
format='Vertical') add_query(
add_query(diagnostics, 'Mutations in progress', diagnostics,
client=client, "Merges in progress",
query=SELECT_MUTATIONS, client=client,
format='Vertical') query=SELECT_MERGES,
add_query(diagnostics, 'Recent data parts (modification time within last 3 minutes)', format="Vertical",
client=client, )
query=SELECT_RECENT_DATA_PARTS, add_query(
format='Vertical') diagnostics,
"Mutations in progress",
client=client,
query=SELECT_MUTATIONS,
format="Vertical",
)
add_query(
diagnostics,
"Recent data parts (modification time within last 3 minutes)",
client=client,
query=SELECT_RECENT_DATA_PARTS,
format="Vertical",
)
add_query(diagnostics, 'system.detached_parts', add_query(
client=client, diagnostics,
query=SELECT_DETACHED_DATA_PARTS, "system.detached_parts",
format='PrettyCompactNoEscapes', client=client,
section='Detached data') query=SELECT_DETACHED_DATA_PARTS,
add_command(diagnostics, 'Disk space usage', format="PrettyCompactNoEscapes",
command='du -sh -L -c /var/lib/clickhouse/data/*/*/detached/* | sort -rsh', section="Detached data",
section='Detached data') )
add_command(
diagnostics,
"Disk space usage",
command="du -sh -L -c /var/lib/clickhouse/data/*/*/detached/* | sort -rsh",
section="Detached data",
)
add_query(diagnostics, 'Queries in progress (process list)', add_query(
client=client, diagnostics,
query=SELECT_PROCESSES, "Queries in progress (process list)",
format='Vertical', client=client,
section='Queries') query=SELECT_PROCESSES,
add_query(diagnostics, 'Top 10 queries by duration', format="Vertical",
client=client, section="Queries",
query=SELECT_TOP_QUERIES_BY_DURATION, )
format='Vertical', add_query(
section='Queries') diagnostics,
add_query(diagnostics, 'Top 10 queries by memory usage', "Top 10 queries by duration",
client=client, client=client,
query=SELECT_TOP_QUERIES_BY_MEMORY_USAGE, query=SELECT_TOP_QUERIES_BY_DURATION,
format='Vertical', format="Vertical",
section='Queries') section="Queries",
add_query(diagnostics, 'Last 10 failed queries', )
client=client, add_query(
query=SELECT_FAILED_QUERIES, diagnostics,
format='Vertical', "Top 10 queries by memory usage",
section='Queries') client=client,
query=SELECT_TOP_QUERIES_BY_MEMORY_USAGE,
format="Vertical",
section="Queries",
)
add_query(
diagnostics,
"Last 10 failed queries",
client=client,
query=SELECT_FAILED_QUERIES,
format="Vertical",
section="Queries",
)
add_query(diagnostics, 'Stack traces', add_query(
client=client, diagnostics,
query=SELECT_STACK_TRACES, "Stack traces",
format='Vertical') client=client,
query=SELECT_STACK_TRACES,
format="Vertical",
)
if 'crash_log' in system_tables: if "crash_log" in system_tables:
add_query(diagnostics, 'Crash log', add_query(
client=client, diagnostics,
query=SELECT_CRASH_LOG, "Crash log",
format='Vertical') client=client,
query=SELECT_CRASH_LOG,
format="Vertical",
)
add_command(diagnostics, 'uname', 'uname -a') add_command(diagnostics, "uname", "uname -a")
diagnostics.dump(args.format) diagnostics.dump(args.format)
@ -889,29 +975,34 @@ def parse_args():
Parse command-line arguments. Parse command-line arguments.
""" """
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--format', parser.add_argument(
choices=['json', 'yaml', 'json.gz', 'yaml.gz', 'wiki', 'wiki.gz'], "--format",
default='wiki') choices=["json", "yaml", "json.gz", "yaml.gz", "wiki", "wiki.gz"],
parser.add_argument('--normalize-queries', default="wiki",
action='store_true', )
default=False) parser.add_argument("--normalize-queries", action="store_true", default=False)
parser.add_argument('--host', dest="host", help="clickhouse host") parser.add_argument("--host", dest="host", help="clickhouse host")
parser.add_argument('--port', dest="port", default=8123, help="clickhouse http port") parser.add_argument(
parser.add_argument('--user', dest="user", default="default", help="clickhouse user") "--port", dest="port", default=8123, help="clickhouse http port"
parser.add_argument('--password', dest="password", help="clickhouse password") )
parser.add_argument(
"--user", dest="user", default="default", help="clickhouse user"
)
parser.add_argument("--password", dest="password", help="clickhouse password")
return parser.parse_args() return parser.parse_args()
def add_query(diagnostics, name, client, query, format, section=None): def add_query(diagnostics, name, client, query, format, section=None):
query_args = { query_args = {
'normalize_queries': diagnostics.args.normalize_queries, "normalize_queries": diagnostics.args.normalize_queries,
} }
query = client.render_query(query, **query_args) query = client.render_query(query, **query_args)
diagnostics.add_query( diagnostics.add_query(
name=name, name=name,
query=query, query=query,
result=execute_query(client, query, render_query=False, format=format), result=execute_query(client, query, render_query=False, format=format),
section=section) section=section,
)
def execute_query(client, query, render_query=True, format=None): def execute_query(client, query, render_query=True, format=None):
@ -926,14 +1017,18 @@ def execute_query(client, query, render_query=True, format=None):
def add_command(diagnostics, name, command, section=None): def add_command(diagnostics, name, command, section=None):
diagnostics.add_command( diagnostics.add_command(
name=name, name=name, command=command, result=execute_command(command), section=section
command=command, )
result=execute_command(command),
section=section)
def execute_command(command, input=None): def execute_command(command, input=None):
proc = subprocess.Popen(command, shell=True, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) proc = subprocess.Popen(
command,
shell=True,
stdin=subprocess.PIPE,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
if isinstance(input, str): if isinstance(input, str):
input = input.encode() input = input.encode()
@ -941,7 +1036,7 @@ def execute_command(command, input=None):
stdout, stderr = proc.communicate(input=input) stdout, stderr = proc.communicate(input=input)
if proc.returncode: if proc.returncode:
return f'failed with exit code {proc.returncode}\n{stderr.decode()}' return f"failed with exit code {proc.returncode}\n{stderr.decode()}"
return stdout.decode() return stdout.decode()
@ -957,8 +1052,8 @@ def parse_version(version):
""" """
Parse version string. Parse version string.
""" """
return [int(x) for x in version.strip().split('.') if x.isnumeric()] return [int(x) for x in version.strip().split(".") if x.isnumeric()]
if __name__ == '__main__': if __name__ == "__main__":
main() main()
@ -28,39 +28,48 @@ class S3API(object):
bucket = self.connection.get_bucket(bucket_name) bucket = self.connection.get_bucket(bucket_name)
key = bucket.initiate_multipart_upload(s3_path) key = bucket.initiate_multipart_upload(s3_path)
logging.info("Will upload to s3 path %s", s3_path) logging.info("Will upload to s3 path %s", s3_path)
chunksize = 1024 * 1024 * 1024 # 1 GB chunksize = 1024 * 1024 * 1024 # 1 GB
filesize = os.stat(file_path).st_size filesize = os.stat(file_path).st_size
logging.info("File size is %s", filesize) logging.info("File size is %s", filesize)
chunkcount = int(math.ceil(filesize / chunksize)) chunkcount = int(math.ceil(filesize / chunksize))
def call_back(x, y): def call_back(x, y):
print("Uploaded {}/{} bytes".format(x, y)) print("Uploaded {}/{} bytes".format(x, y))
try: try:
for i in range(chunkcount + 1): for i in range(chunkcount + 1):
logging.info("Uploading chunk %s of %s", i, chunkcount + 1) logging.info("Uploading chunk %s of %s", i, chunkcount + 1)
offset = chunksize * i offset = chunksize * i
bytes_size = min(chunksize, filesize - offset) bytes_size = min(chunksize, filesize - offset)
with open(file_path, 'r') as fp: with open(file_path, "r") as fp:
fp.seek(offset) fp.seek(offset)
key.upload_part_from_file(fp=fp, part_num=i+1, key.upload_part_from_file(
size=bytes_size, cb=call_back, fp=fp, part_num=i + 1, size=bytes_size, cb=call_back, num_cb=100
num_cb=100) )
key.complete_upload() key.complete_upload()
except Exception as ex: except Exception as ex:
key.cancel_upload() key.cancel_upload()
raise ex raise ex
logging.info("Contents were set") logging.info("Contents were set")
return "https://{bucket}.{mds_url}/{path}".format( return "https://{bucket}.{mds_url}/{path}".format(
bucket=bucket_name, mds_url=self.mds_url, path=s3_path) bucket=bucket_name, mds_url=self.mds_url, path=s3_path
)
def set_file_contents(self, bucket, local_file_path, s3_file_path): def set_file_contents(self, bucket, local_file_path, s3_file_path):
key = Key(bucket) key = Key(bucket)
key.key = s3_file_path key.key = s3_file_path
file_size = os.stat(local_file_path).st_size file_size = os.stat(local_file_path).st_size
logging.info("Uploading file `%s` to `%s`. Size is %s", local_file_path, s3_file_path, file_size) logging.info(
"Uploading file `%s` to `%s`. Size is %s",
local_file_path,
s3_file_path,
file_size,
)
def call_back(x, y): def call_back(x, y):
print("Uploaded {}/{} bytes".format(x, y)) print("Uploaded {}/{} bytes".format(x, y))
key.set_contents_from_filename(local_file_path, cb=call_back) key.set_contents_from_filename(local_file_path, cb=call_back)
def upload_data_for_static_files_disk(self, bucket_name, directory_path, s3_path): def upload_data_for_static_files_disk(self, bucket_name, directory_path, s3_path):
@ -74,12 +83,14 @@ class S3API(object):
path = root.split(os.sep) path = root.split(os.sep)
for file in files: for file in files:
local_file_path = os.path.join(root, file) local_file_path = os.path.join(root, file)
s3_file = local_file_path[len(directory_path) + 1:] s3_file = local_file_path[len(directory_path) + 1 :]
s3_file_path = os.path.join(s3_path, s3_file) s3_file_path = os.path.join(s3_path, s3_file)
self.set_file_contents(bucket, local_file_path, s3_file_path) self.set_file_contents(bucket, local_file_path, s3_file_path)
logging.info("Uploading finished") logging.info("Uploading finished")
return "https://{bucket}.{mds_url}/{path}".format(bucket=bucket_name, mds_url=self.mds_url, path=s3_path) return "https://{bucket}.{mds_url}/{path}".format(
bucket=bucket_name, mds_url=self.mds_url, path=s3_path
)
def list_bucket_keys(self, bucket_name): def list_bucket_keys(self, bucket_name):
bucket = self.connection.get_bucket(bucket_name) bucket = self.connection.get_bucket(bucket_name)
@ -91,100 +102,121 @@ class S3API(object):
bucket.get_all_keys() bucket.get_all_keys()
for obj in bucket.get_all_keys(): for obj in bucket.get_all_keys():
if obj.key.startswith(folder_path): if obj.key.startswith(folder_path):
print('Removing ' + obj.key) print("Removing " + obj.key)
obj.delete() obj.delete()
def make_tar_file_for_table(clickhouse_data_path, db_name, table_name, def make_tar_file_for_table(clickhouse_data_path, db_name, table_name, tmp_prefix):
tmp_prefix):
relative_data_path = os.path.join('data', db_name, table_name) relative_data_path = os.path.join("data", db_name, table_name)
relative_meta_path = os.path.join('metadata', db_name, table_name + '.sql') relative_meta_path = os.path.join("metadata", db_name, table_name + ".sql")
path_to_data = os.path.join(clickhouse_data_path, relative_data_path) path_to_data = os.path.join(clickhouse_data_path, relative_data_path)
path_to_metadata = os.path.join(clickhouse_data_path, relative_meta_path) path_to_metadata = os.path.join(clickhouse_data_path, relative_meta_path)
temporary_file_name = tmp_prefix + '/{tname}.tar'.format(tname=table_name) temporary_file_name = tmp_prefix + "/{tname}.tar".format(tname=table_name)
with tarfile.open(temporary_file_name, "w") as bundle: with tarfile.open(temporary_file_name, "w") as bundle:
bundle.add(path_to_data, arcname=relative_data_path) bundle.add(path_to_data, arcname=relative_data_path)
bundle.add(path_to_metadata, arcname=relative_meta_path) bundle.add(path_to_metadata, arcname=relative_meta_path)
return temporary_file_name return temporary_file_name
USAGE_EXAMPLES = ''' USAGE_EXAMPLES = """
examples: examples:
\t./s3uploader --dataset-name some_ds --access-key-id XXX --secret-access-key YYY --clickhouse-data-path /opt/clickhouse/ --table-name default.some_tbl --bucket-name some-bucket \t./s3uploader --dataset-name some_ds --access-key-id XXX --secret-access-key YYY --clickhouse-data-path /opt/clickhouse/ --table-name default.some_tbl --bucket-name some-bucket
\t./s3uploader --dataset-name some_ds --access-key-id XXX --secret-access-key YYY --file-path some_ds.tsv.xz --bucket-name some-bucket --s3-path /path/to/ \t./s3uploader --dataset-name some_ds --access-key-id XXX --secret-access-key YYY --file-path some_ds.tsv.xz --bucket-name some-bucket --s3-path /path/to/
''' """
if __name__ == "__main__": if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s') logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Simple tool for uploading datasets to clickhouse S3", description="Simple tool for uploading datasets to clickhouse S3",
usage='%(prog)s [options] {}'.format(USAGE_EXAMPLES)) usage="%(prog)s [options] {}".format(USAGE_EXAMPLES),
parser.add_argument('--s3-api-url', default='s3.amazonaws.com') )
parser.add_argument('--s3-common-url', default='s3.amazonaws.com') parser.add_argument("--s3-api-url", default="s3.amazonaws.com")
parser.add_argument('--bucket-name', default='clickhouse-datasets') parser.add_argument("--s3-common-url", default="s3.amazonaws.com")
parser.add_argument('--dataset-name', required=True, parser.add_argument("--bucket-name", default="clickhouse-datasets")
help='Name of dataset, will be used in uploaded path') parser.add_argument(
parser.add_argument('--access-key-id', required=True) "--dataset-name",
parser.add_argument('--secret-access-key', required=True) required=True,
parser.add_argument('--clickhouse-data-path', help="Name of dataset, will be used in uploaded path",
default='/var/lib/clickhouse/', )
help='Path to clickhouse database on filesystem') parser.add_argument("--access-key-id", required=True)
parser.add_argument('--s3-path', help='Path in s3, where to upload file') parser.add_argument("--secret-access-key", required=True)
parser.add_argument('--tmp-prefix', default='/tmp', parser.add_argument(
help='Prefix to store temporary downloaded file') "--clickhouse-data-path",
default="/var/lib/clickhouse/",
help="Path to clickhouse database on filesystem",
)
parser.add_argument("--s3-path", help="Path in s3, where to upload file")
parser.add_argument(
"--tmp-prefix", default="/tmp", help="Prefix to store temporary downloaded file"
)
data_group = parser.add_mutually_exclusive_group(required=True) data_group = parser.add_mutually_exclusive_group(required=True)
table_name_argument = data_group.add_argument('--table-name', table_name_argument = data_group.add_argument(
help='Name of table with database, if you are uploading partitions') "--table-name",
data_group.add_argument('--file-path', help="Name of table with database, if you are uploading partitions",
help='Name of file, if you are uploading') )
data_group.add_argument('--directory-path', help='Path to directory with files to upload') data_group.add_argument("--file-path", help="Name of file, if you are uploading")
data_group.add_argument('--list-directory', help='List s3 directory by --directory-path') data_group.add_argument(
data_group.add_argument('--remove-directory', help='Remove s3 directory by --directory-path') "--directory-path", help="Path to directory with files to upload"
)
data_group.add_argument(
"--list-directory", help="List s3 directory by --directory-path"
)
data_group.add_argument(
"--remove-directory", help="Remove s3 directory by --directory-path"
)
args = parser.parse_args() args = parser.parse_args()
if args.table_name is not None and args.clickhouse_data_path is None: if args.table_name is not None and args.clickhouse_data_path is None:
raise argparse.ArgumentError(table_name_argument, raise argparse.ArgumentError(
"You should specify --clickhouse-data-path to upload --table") table_name_argument,
"You should specify --clickhouse-data-path to upload --table",
)
s3_conn = S3API( s3_conn = S3API(
args.access_key_id, args.secret_access_key, args.access_key_id, args.secret_access_key, args.s3_api_url, args.s3_common_url
args.s3_api_url, args.s3_common_url) )
file_path = '' file_path = ""
directory_path = args.directory_path directory_path = args.directory_path
s3_path = args.s3_path s3_path = args.s3_path
if args.list_directory: if args.list_directory:
s3_conn.list_bucket_keys(args.bucket_name) s3_conn.list_bucket_keys(args.bucket_name)
elif args.remove_directory: elif args.remove_directory:
print('Removing s3 path: ' + args.remove_directory) print("Removing s3 path: " + args.remove_directory)
s3_conn.remove_folder_from_bucket(args.bucket_name, args.remove_directory) s3_conn.remove_folder_from_bucket(args.bucket_name, args.remove_directory)
elif args.directory_path is not None: elif args.directory_path is not None:
url = s3_conn.upload_data_for_static_files_disk(args.bucket_name, directory_path, s3_path) url = s3_conn.upload_data_for_static_files_disk(
args.bucket_name, directory_path, s3_path
)
logging.info("Data uploaded: %s", url) logging.info("Data uploaded: %s", url)
else: else:
if args.table_name is not None: if args.table_name is not None:
if '.' not in args.table_name: if "." not in args.table_name:
db_name = 'default' db_name = "default"
else: else:
db_name, table_name = args.table_name.split('.') db_name, table_name = args.table_name.split(".")
file_path = make_tar_file_for_table( file_path = make_tar_file_for_table(
args.clickhouse_data_path, db_name, table_name, args.tmp_prefix) args.clickhouse_data_path, db_name, table_name, args.tmp_prefix
)
else: else:
file_path = args.file_path file_path = args.file_path
if 'tsv' in file_path: if "tsv" in file_path:
s3_path = os.path.join( s3_path = os.path.join(
args.dataset_name, 'tsv', os.path.basename(file_path)) args.dataset_name, "tsv", os.path.basename(file_path)
)
if args.table_name is not None: if args.table_name is not None:
s3_path = os.path.join( s3_path = os.path.join(
args.dataset_name, 'partitions', os.path.basename(file_path)) args.dataset_name, "partitions", os.path.basename(file_path)
)
elif args.s3_path is not None: elif args.s3_path is not None:
s3_path = os.path.join( s3_path = os.path.join(
args.dataset_name, args.s3_path, os.path.basename(file_path)) args.dataset_name, args.s3_path, os.path.basename(file_path)
)
else: else:
raise Exception("Don't know s3-path to upload") raise Exception("Don't know s3-path to upload")
@ -11,13 +11,14 @@ from termcolor import colored
import sys import sys
COLORMAP = { COLORMAP = {
"success": colored("success", 'green'), "success": colored("success", "green"),
"failure": colored("failure", 'red'), "failure": colored("failure", "red"),
"error": colored("error", 'red'), "error": colored("error", "red"),
"pending": colored("pending", 'yellow'), "pending": colored("pending", "yellow"),
"not run": colored("not run", 'white'), "not run": colored("not run", "white"),
} }
def _filter_statuses(statuses): def _filter_statuses(statuses):
""" """
Squash statuses to latest state Squash statuses to latest state
@ -69,7 +70,7 @@ if __name__ == "__main__":
date_since = datetime.datetime.strptime(args.since, "%Y-%m-%d %H:%M:%S") date_since = datetime.datetime.strptime(args.since, "%Y-%m-%d %H:%M:%S")
gh = Github(args.token) gh = Github(args.token)
repo = gh.get_repo('ClickHouse/ClickHouse') repo = gh.get_repo("ClickHouse/ClickHouse")
commits = get_commits(repo, date_since) commits = get_commits(repo, date_since)
longest_header = [] longest_header = []
@ -101,6 +102,6 @@ if __name__ == "__main__":
result_data.append(current_result) result_data.append(current_result)
if sys.stdout.isatty(): if sys.stdout.isatty():
longest_header = [colored(h, 'white', attrs=['bold']) for h in longest_header] longest_header = [colored(h, "white", attrs=["bold"]) for h in longest_header]
print(tabulate.tabulate(result_data, headers=longest_header, tablefmt="grid")) print(tabulate.tabulate(result_data, headers=longest_header, tablefmt="grid"))