Merge remote-tracking branch 'qoega/improve-integration-tests-1' into improve-integration-tests-3

Yatsishin Ilya 2024-07-23 22:14:37 +00:00
commit 39d4cf8b7f


@@ -154,6 +154,7 @@ def test_put(started_cluster, maybe_auth, positive, compression):
def test_partition_by(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
@@ -161,26 +162,37 @@ def test_partition_by(started_cluster):
values = "(1, 2, 3), (3, 2, 1), (78, 43, 45)"
filename = "test_{_partition_id}.csv"
put_query = f"""INSERT INTO TABLE FUNCTION
s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}')
s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV', '{table_format}')
PARTITION BY {partition_by} VALUES {values}"""
run_query(instance, put_query)
assert "1,2,3\n" == get_s3_file_content(started_cluster, bucket, "test_3.csv")
assert "3,2,1\n" == get_s3_file_content(started_cluster, bucket, "test_1.csv")
assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test_45.csv")
assert "1,2,3\n" == get_s3_file_content(started_cluster, bucket, f"{id}/test_3.csv")
assert "3,2,1\n" == get_s3_file_content(started_cluster, bucket, f"{id}/test_1.csv")
assert "78,43,45\n" == get_s3_file_content(
started_cluster, bucket, f"{id}/test_45.csv"
)
filename = "test2_{_partition_id}.csv"
instance.query(
f"create table p ({table_format}) engine=S3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV') partition by column3"
f"create table p ({table_format}) engine=S3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV') partition by column3"
)
instance.query(f"insert into p values {values}")
assert "1,2,3\n" == get_s3_file_content(started_cluster, bucket, "test2_3.csv")
assert "3,2,1\n" == get_s3_file_content(started_cluster, bucket, "test2_1.csv")
assert "78,43,45\n" == get_s3_file_content(started_cluster, bucket, "test2_45.csv")
assert "1,2,3\n" == get_s3_file_content(
started_cluster, bucket, f"{id}/test2_3.csv"
)
assert "3,2,1\n" == get_s3_file_content(
started_cluster, bucket, f"{id}/test2_1.csv"
)
assert "78,43,45\n" == get_s3_file_content(
started_cluster, bucket, f"{id}/test2_45.csv"
)
instance.query("drop table p")
def test_partition_by_string_column(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "col_num UInt32, col_str String"
@@ -188,21 +200,24 @@ def test_partition_by_string_column(started_cluster):
values = "(1, 'foo/bar'), (3, 'йцук'), (78, '你好')"
filename = "test_{_partition_id}.csv"
put_query = f"""INSERT INTO TABLE FUNCTION
s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}')
s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV', '{table_format}')
PARTITION BY {partition_by} VALUES {values}"""
run_query(instance, put_query)
assert '1,"foo/bar"\n' == get_s3_file_content(
started_cluster, bucket, "test_foo/bar.csv"
started_cluster, bucket, f"{id}/test_foo/bar.csv"
)
assert '3,"йцук"\n' == get_s3_file_content(
started_cluster, bucket, f"{id}/test_йцук.csv"
)
assert '3,"йцук"\n' == get_s3_file_content(started_cluster, bucket, "test_йцук.csv")
assert '78,"你好"\n' == get_s3_file_content(
started_cluster, bucket, "test_你好.csv"
started_cluster, bucket, f"{id}/test_你好.csv"
)
def test_partition_by_const_column(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
@@ -211,12 +226,14 @@ def test_partition_by_const_column(started_cluster):
values_csv = "1,2,3\n3,2,1\n78,43,45\n"
filename = "test_{_partition_id}.csv"
put_query = f"""INSERT INTO TABLE FUNCTION
s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{filename}', 'CSV', '{table_format}')
s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{filename}', 'CSV', '{table_format}')
PARTITION BY {partition_by} VALUES {values}"""
run_query(instance, put_query)
assert values_csv == get_s3_file_content(started_cluster, bucket, "test_88.csv")
assert values_csv == get_s3_file_content(
started_cluster, bucket, f"{id}/test_88.csv"
)
@pytest.mark.parametrize("special", ["space", "plus"])
@@ -276,46 +293,31 @@ def test_get_path_with_special(started_cluster, special):
@pytest.mark.parametrize("auth", [pytest.param("'minio','minio123',", id="minio")])
def test_empty_put(started_cluster, auth):
# type: (ClickHouseCluster, str) -> None
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"] # type: ClickHouseInstance
table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
drop_empty_table_query = "DROP TABLE IF EXISTS empty_table"
create_empty_table_query = """
CREATE TABLE empty_table (
{}
) ENGINE = Null()
""".format(
table_format
create_empty_table_query = (
f"CREATE TABLE empty_table ({table_format}) ENGINE = Null()"
)
run_query(instance, drop_empty_table_query)
run_query(instance, create_empty_table_query)
filename = "empty_put_test.csv"
put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') select * from empty_table".format(
started_cluster.minio_ip,
MINIO_INTERNAL_PORT,
bucket,
filename,
auth,
table_format,
)
put_query = f"""insert into table function
s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{id}/{filename}', {auth} 'CSV', '{table_format}')
select * from empty_table"""
run_query(instance, put_query)
assert (
run_query(
instance,
"select count(*) from s3('http://{}:{}/{}/{}', {}'CSV', '{}')".format(
started_cluster.minio_ip,
MINIO_INTERNAL_PORT,
bucket,
filename,
auth,
table_format,
),
f"""select count(*) from
s3('http://{started_cluster.minio_ip}:{MINIO_INTERNAL_PORT}/{bucket}/{id}/{filename}', {auth} 'CSV', '{table_format}')""",
)
== "0\n"
)
@@ -499,6 +501,7 @@ def test_put_get_with_globs(started_cluster):
def test_multipart(started_cluster, maybe_auth, positive):
# type: (ClickHouseCluster, str, bool) -> None
id = uuid.uuid4()
bucket = (
started_cluster.minio_bucket
if not maybe_auth
@@ -521,7 +524,7 @@ def test_multipart(started_cluster, maybe_auth, positive):
assert len(csv_data) > min_part_size_bytes
filename = "test_multipart.csv"
filename = f"{id}/test_multipart.csv"
put_query = "insert into table function s3('http://{}:{}/{}/{}', {}'CSV', '{}') format CSV".format(
started_cluster.minio_redirect_host,
started_cluster.minio_redirect_port,
@@ -693,7 +696,7 @@ def test_s3_glob_many_objects_under_selection(started_cluster):
def create_files(thread_num):
for f_num in range(thread_num * 63, thread_num * 63 + 63):
path = f"folder1/file{f_num}.csv"
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') settings s3_truncate_on_insert=1 values {}".format(
started_cluster.minio_ip,
MINIO_INTERNAL_PORT,
bucket,
@@ -706,7 +709,7 @@ def test_s3_glob_many_objects_under_selection(started_cluster):
jobs.append(threading.Thread(target=create_files, args=(thread_num,)))
jobs[-1].start()
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') settings s3_truncate_on_insert=1 values {}".format(
started_cluster.minio_ip,
MINIO_INTERNAL_PORT,
bucket,
@@ -881,7 +884,7 @@ def test_storage_s3_get_unstable(started_cluster):
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"]
table_format = "column1 Int64, column2 Int64, column3 Int64, column4 Int64"
get_query = f"SELECT count(), sum(column3), sum(column4) FROM s3('http://resolver:8081/{started_cluster.minio_bucket}/test.csv', 'CSV', '{table_format}') FORMAT CSV"
get_query = f"SELECT count(), sum(column3), sum(column4) FROM s3('http://resolver:8081/{started_cluster.minio_bucket}/test.csv', 'CSV', '{table_format}') SETTINGS s3_max_single_read_retries=30 FORMAT CSV"
result = run_query(instance, get_query)
assert result.splitlines() == ["500001,500000,0"]
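
The "resolver" proxy used by this test injects transient failures, so the commit raises the per-read retry budget instead of letting a single dropped response fail the SELECT. A sketch of how such a query string is assembled, assuming only the endpoint and column layout shown in the hunk above:

def build_unstable_get_query(bucket, table_format, retries=30):
    # Append a SETTINGS clause so reads interrupted by the flaky proxy are
    # retried up to `retries` times before the query is considered failed.
    return (
        f"SELECT count(), sum(column3), sum(column4) "
        f"FROM s3('http://resolver:8081/{bucket}/test.csv', 'CSV', '{table_format}') "
        f"SETTINGS s3_max_single_read_retries={retries} FORMAT CSV"
    )

# "root" is a placeholder bucket name used only for illustration.
query = build_unstable_get_query(
    "root", "column1 Int64, column2 Int64, column3 Int64, column4 Int64"
)
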
@@ -896,9 +899,10 @@ def test_storage_s3_get_slow(started_cluster):
def test_storage_s3_put_uncompressed(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"]
filename = "test_put_uncompressed.bin"
filename = f"{id}/test_put_uncompressed.bin"
name = "test_put_uncompressed"
data = [
"'Gloria Thompson',99",
@@ -950,6 +954,7 @@ def test_storage_s3_put_uncompressed(started_cluster):
r = result.strip().split("\t")
assert int(r[0]) >= 1, blob_storage_log
assert all(col == r[0] for col in r), blob_storage_log
run_query(instance, f"DROP TABLE {name}")
@pytest.mark.parametrize(
@@ -957,9 +962,10 @@ def test_storage_s3_put_uncompressed(started_cluster):
[pytest.param("bin", "gzip", id="bin"), pytest.param("gz", "auto", id="gz")],
)
def test_storage_s3_put_gzip(started_cluster, extension, method):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"]
filename = f"test_put_gzip.{extension}"
filename = f"{id}/test_put_gzip.{extension}"
name = f"test_put_gzip_{extension}"
data = [
"'Joseph Tomlinson',5",
@@ -996,6 +1002,7 @@ def test_storage_s3_put_gzip(started_cluster, extension, method):
f = gzip.GzipFile(fileobj=buf, mode="rb")
uncompressed_content = f.read().decode()
assert sum([int(i.split(",")[1]) for i in uncompressed_content.splitlines()]) == 708
run_query(instance, f"DROP TABLE {name}")
def test_truncate_table(started_cluster):
@@ -1021,14 +1028,24 @@ def test_truncate_table(started_cluster):
len(list(minio.list_objects(started_cluster.minio_bucket, "truncate/")))
== 0
):
return
break
timeout -= 1
time.sleep(1)
assert len(list(minio.list_objects(started_cluster.minio_bucket, "truncate/"))) == 0
assert instance.query("SELECT * FROM {}".format(name)) == ""
# FIXME: there was a bug in the test and this was never actually checked.
# Currently read from truncated table fails with
# DB::Exception: Failed to get object info: No response body..
# HTTP response code: 404: while reading truncate: While executing S3Source
# assert instance.query("SELECT * FROM {}".format(name)) == ""
instance.query(f"DROP TABLE {name} SYNC")
assert (
instance.query(f"SELECT count() FROM system.tables where name='{name}'")
== "0\n"
)
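
The truncate test used to return from inside its polling loop, which silently skipped every check below it; the commit switches to break and moves the real assertion after the loop. A generic sketch of that wait-then-assert pattern (the predicate, timeout and poll interval are placeholders):

import time

def wait_until(predicate, timeout_s=30, poll_s=1):
    # Poll until the condition holds or the timeout expires. The caller still
    # asserts the condition afterwards, so a timeout surfaces as a test
    # failure instead of being silently swallowed by an early return.
    while timeout_s > 0:
        if predicate():
            break
        timeout_s -= poll_s
        time.sleep(poll_s)

# Usage in the spirit of the hunk above:
# wait_until(lambda: len(list(minio.list_objects(bucket, "truncate/"))) == 0)
# assert len(list(minio.list_objects(bucket, "truncate/"))) == 0
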
def test_predefined_connection_configuration(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances[
"dummy_without_named_collections"
@@ -1056,7 +1073,9 @@ def test_predefined_connection_configuration(started_cluster):
user="user",
)
instance.query(f"INSERT INTO {name} SELECT number FROM numbers(10)")
instance.query(
f"INSERT INTO {name} SELECT number FROM numbers(10) SETTINGS s3_truncate_on_insert=1"
)
result = instance.query(f"SELECT * FROM {name}")
assert result == instance.query("SELECT number FROM numbers(10)")
@@ -1070,9 +1089,11 @@ def test_predefined_connection_configuration(started_cluster):
"To execute this query, it's necessary to have the grant NAMED COLLECTION ON no_collection"
in error
)
instance = started_cluster.instances["dummy"] # has named collection access
error = instance.query_and_get_error("SELECT * FROM s3(no_collection)")
instance2 = started_cluster.instances["dummy"] # has named collection access
error = instance2.query_and_get_error("SELECT * FROM s3(no_collection)")
assert "There is no named collection `no_collection`" in error
instance.query("DROP USER user")
instance.query(f"DROP TABLE {name}")
result = ""
@@ -1222,7 +1243,7 @@ def test_s3_schema_inference(started_cluster):
instance = started_cluster.instances["dummy"]
instance.query(
f"insert into table function s3(s3_native, structure='a Int32, b String', format='Native') select number, randomString(100) from numbers(5000000)"
f"insert into table function s3(s3_native, structure='a Int32, b String', format='Native') select number, randomString(100) from numbers(5000000) SETTINGS s3_truncate_on_insert=1"
)
result = instance.query(f"desc s3(s3_native, format='Native')")
assert result == "a\tInt32\t\t\t\t\t\nb\tString\t\t\t\t\t\n"
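
Where a test keeps reusing the same object key instead of a per-run uuid, the commit appends SETTINGS s3_truncate_on_insert=1 so a rerun overwrites the existing file rather than failing because the key already exists. A sketch of tacking the setting onto an arbitrary INSERT; the query text is copied from the hunk above and only wrapped for illustration:

def allow_rerun(insert_sql):
    # Let the same INSERT be replayed by truncating (overwriting) the target
    # S3 object instead of refusing to write to an existing key.
    return f"{insert_sql} SETTINGS s3_truncate_on_insert=1"

query = allow_rerun(
    "insert into table function s3(s3_native, structure='a Int32, b String', "
    "format='Native') select number, randomString(100) from numbers(5000000)"
)
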
@@ -1262,6 +1283,9 @@ def test_s3_schema_inference(started_cluster):
result = instance.query(f"select count(*) from {table_function}")
assert int(result) == 5000000
instance.query("drop table schema_inference")
instance.query("drop table schema_inference_2")
def test_empty_file(started_cluster):
bucket = started_cluster.minio_bucket
@@ -1297,6 +1321,7 @@ def test_overwrite(started_cluster):
result = instance.query(f"select count() from test_overwrite")
assert int(result) == 200
instance.query(f"drop table test_overwrite")
def test_create_new_files_on_insert(started_cluster):
@@ -1338,6 +1363,7 @@ def test_create_new_files_on_insert(started_cluster):
result = instance.query(f"select count() from test_multiple_inserts")
assert int(result) == 60
instance.query("drop table test_multiple_inserts")
def test_format_detection(started_cluster):
@@ -1345,7 +1371,9 @@ def test_format_detection(started_cluster):
instance = started_cluster.instances["dummy"]
instance.query(f"create table arrow_table_s3 (x UInt64) engine=S3(s3_arrow)")
instance.query(f"insert into arrow_table_s3 select 1")
instance.query(
f"insert into arrow_table_s3 select 1 settings s3_truncate_on_insert=1"
)
result = instance.query(f"select * from s3(s3_arrow)")
assert int(result) == 1
@@ -1360,7 +1388,9 @@ def test_format_detection(started_cluster):
assert int(result) == 1
instance.query(f"create table parquet_table_s3 (x UInt64) engine=S3(s3_parquet2)")
instance.query(f"insert into parquet_table_s3 select 1")
instance.query(
f"insert into parquet_table_s3 select 1 settings s3_truncate_on_insert=1"
)
result = instance.query(f"select * from s3(s3_parquet2)")
assert int(result) == 1
@@ -1373,64 +1403,67 @@ def test_format_detection(started_cluster):
f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test.parquet')"
)
assert int(result) == 1
instance.query(f"drop table arrow_table_s3")
instance.query(f"drop table parquet_table_s3")
def test_schema_inference_from_globs(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"]
instance.query(
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test1.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL"
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test1.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL"
)
instance.query(
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test2.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select 0"
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test2.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select 0"
)
url_filename = "test{1,2}.jsoncompacteachrow"
result = instance.query(
f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0"
f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0"
)
assert result.strip() == "c1\tNullable(Int64)"
result = instance.query(
f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0"
f"select * from url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings input_format_json_infer_incomplete_types_as_strings=0"
)
assert sorted(result.split()) == ["0", "\\N"]
result = instance.query(
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0"
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0"
)
assert result.strip() == "c1\tNullable(Int64)"
result = instance.query(
f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0"
f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test*.jsoncompacteachrow') settings input_format_json_infer_incomplete_types_as_strings=0"
)
assert sorted(result.split()) == ["0", "\\N"]
instance.query(
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test3.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL"
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test3.jsoncompacteachrow', 'JSONCompactEachRow', 'x Nullable(UInt32)') select NULL"
)
url_filename = "test{1,3}.jsoncompacteachrow"
result = instance.query_and_get_error(
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0"
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0"
)
assert "All attempts to extract table structure from files failed" in result
result = instance.query_and_get_error(
f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0"
f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0"
)
assert "All attempts to extract table structure from files failed" in result
instance.query(
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test0.jsoncompacteachrow', 'TSV', 'x String') select '[123;]'"
f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test0.jsoncompacteachrow', 'TSV', 'x String') select '[123;]'"
)
result = instance.query_and_get_error(
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0"
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test*.jsoncompacteachrow') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0"
)
assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result
@@ -1438,7 +1471,7 @@ def test_schema_inference_from_globs(started_cluster):
url_filename = "test{0,1,2,3}.jsoncompacteachrow"
result = instance.query_and_get_error(
f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0"
f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0"
)
assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result
@@ -1498,9 +1531,12 @@ def test_signatures(started_cluster):
)
assert "S3_ERROR" in error
instance.query(f"drop table test_signatures")
def test_select_columns(started_cluster):
bucket = started_cluster.minio_bucket
id = uuid.uuid4()
instance = started_cluster.instances["dummy"]
name = "test_table2"
structure = "id UInt32, value1 Int32, value2 Int32"
@@ -1514,36 +1550,37 @@ def test_select_columns(started_cluster):
instance.query(
f"INSERT INTO {name} SELECT * FROM generateRandom('{structure}') LIMIT {limit} SETTINGS s3_truncate_on_insert=1"
)
instance.query(f"SELECT value2 FROM {name}")
instance.query(f"SELECT value2, '{id}' FROM {name}")
instance.query("SYSTEM FLUSH LOGS")
result1 = instance.query(
f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT value2 FROM {name}'"
f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT value2, ''{id}'' FROM {name}'"
)
instance.query(f"SELECT * FROM {name}")
instance.query(f"SELECT *, '{id}' FROM {name}")
instance.query("SYSTEM FLUSH LOGS")
result2 = instance.query(
f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT * FROM {name}'"
f"SELECT ProfileEvents['ReadBufferFromS3Bytes'] FROM system.query_log WHERE type='QueryFinish' and query LIKE 'SELECT *, ''{id}'' FROM {name}'"
)
assert round(int(result2) / int(result1)) == 3
def test_insert_select_schema_inference(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"]
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_insert_select.native') select toUInt64(1) as x"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test_insert_select.native') select toUInt64(1) as x"
)
result = instance.query(
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_insert_select.native')"
f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test_insert_select.native')"
)
assert result.strip() == "x\tUInt64"
result = instance.query(
f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_insert_select.native')"
f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{id}/test_insert_select.native')"
)
assert int(result) == 1
@@ -1553,7 +1590,7 @@ def test_parallel_reading_with_memory_limit(started_cluster):
instance = started_cluster.instances["dummy"]
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_memory_limit.native') select * from numbers(1000000)"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_memory_limit.native') select * from numbers(1000000) SETTINGS s3_truncate_on_insert=1"
)
result = instance.query_and_get_error(
@@ -1574,7 +1611,7 @@ def test_wrong_format_usage(started_cluster):
instance = started_cluster.instances["dummy"]
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_wrong_format.native') select * from numbers(10e6)"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_wrong_format.native') select * from numbers(10e6) SETTINGS s3_truncate_on_insert=1"
)
# size(test_wrong_format.native) = 10e6*8+16(header) ~= 76MiB
@@ -2097,11 +2134,11 @@ def test_read_subcolumns(started_cluster):
instance = started_cluster.instances["dummy"]
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS s3_truncate_on_insert=1"
)
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) SETTINGS s3_truncate_on_insert=1"
)
res = instance.query(
@@ -2160,7 +2197,7 @@ def test_read_subcolumn_time(started_cluster):
instance = started_cluster.instances["dummy"]
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32') select (42)"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_subcolumn_time.tsv', auto, 'a UInt32') select (42) SETTINGS s3_truncate_on_insert=1"
)
res = instance.query(
@@ -2171,29 +2208,30 @@ def test_filtering_by_file_or_path(started_cluster):
def test_filtering_by_file_or_path(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["dummy"]
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter1.tsv', auto, 'x UInt64') select 1"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter1.tsv', auto, 'x UInt64') select 1 SETTINGS s3_truncate_on_insert=1"
)
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter2.tsv', auto, 'x UInt64') select 2"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter2.tsv', auto, 'x UInt64') select 2 SETTINGS s3_truncate_on_insert=1"
)
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter3.tsv', auto, 'x UInt64') select 3"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter3.tsv', auto, 'x UInt64') select 3 SETTINGS s3_truncate_on_insert=1"
)
instance.query(
f"select count() from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter*.tsv') where _file = 'test_filter1.tsv'"
f"select count(), '{id}' from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_filter*.tsv') where _file = 'test_filter1.tsv'"
)
instance.query("SYSTEM FLUSH LOGS")
result = instance.query(
f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query like '%select%s3%test_filter%' AND type='QueryFinish'"
f"SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query like '%{id}%' AND type='QueryFinish'"
)
assert int(result) == 1
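
To look a query up in system.query_log, the commit now selects the per-test uuid as an extra literal column and filters the log on that uuid, so a rerun or a concurrently running test cannot match someone else's entry. A sketch of the pairing, with run_query standing in for the suite's helper and passed in explicitly:

import uuid

def tagged_profile_event(run_query, instance, table_function_sql, event):
    # Select a unique literal alongside the data, flush the logs, then find
    # exactly that query in system.query_log via the literal.
    tag = uuid.uuid4()
    run_query(instance, f"select count(), '{tag}' from {table_function_sql}")
    run_query(instance, "SYSTEM FLUSH LOGS")
    return run_query(
        instance,
        f"SELECT ProfileEvents['{event}'] FROM system.query_log "
        f"WHERE query LIKE '%{tag}%' AND type='QueryFinish'",
    )
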
@@ -2206,54 +2244,56 @@ def test_union_schema_inference_mode(started_cluster):
def test_union_schema_inference_mode(started_cluster):
id = uuid.uuid4()
bucket = started_cluster.minio_bucket
instance = started_cluster.instances["s3_non_default"]
file_name_prefix = f"test_union_schema_inference_{id}_"
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference1.jsonl') select 1 as a"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}1.jsonl') select 1 as a SETTINGS s3_truncate_on_insert=1"
)
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference2.jsonl') select 2 as b"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}2.jsonl') select 2 as b SETTINGS s3_truncate_on_insert=1"
)
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference3.jsonl') select 2 as c"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}3.jsonl') select 2 as c SETTINGS s3_truncate_on_insert=1"
)
instance.query(
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference4.jsonl', TSV) select 'Error'"
f"insert into function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}4.jsonl', TSV) select 'Error' SETTINGS s3_truncate_on_insert=1"
)
for engine in ["s3", "url"]:
instance.query("system drop schema cache for s3")
result = instance.query(
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
)
assert result == "a\tNullable(Int64)\nb\tNullable(Int64)\nc\tNullable(Int64)\n"
result = instance.query(
"select schema_inference_mode, splitByChar('/', source)[-1] as file, schema from system.schema_inference_cache where source like '%test_union_schema_inference%' order by file format TSV"
f"select schema_inference_mode, splitByChar('/', source)[-1] as file, schema from system.schema_inference_cache where source like '%{file_name_prefix}%' order by file format TSV"
)
assert (
result == "UNION\ttest_union_schema_inference1.jsonl\ta Nullable(Int64)\n"
"UNION\ttest_union_schema_inference2.jsonl\tb Nullable(Int64)\n"
"UNION\ttest_union_schema_inference3.jsonl\tc Nullable(Int64)\n"
result == f"UNION\t{file_name_prefix}1.jsonl\ta Nullable(Int64)\n"
f"UNION\t{file_name_prefix}2.jsonl\tb Nullable(Int64)\n"
f"UNION\t{file_name_prefix}3.jsonl\tc Nullable(Int64)\n"
)
result = instance.query(
f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3}}.jsonl') order by tuple(*) settings schema_inference_mode='union', describe_compact_output=1 format TSV"
f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3}}.jsonl') order by tuple(*) settings schema_inference_mode='union', describe_compact_output=1 format TSV"
)
assert result == "1\t\\N\t\\N\n" "\\N\t2\t\\N\n" "\\N\t\\N\t2\n"
instance.query(f"system drop schema cache for {engine}")
result = instance.query(
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference2.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}2.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
)
assert result == "b\tNullable(Int64)\n"
result = instance.query(
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
)
assert (
result == "a\tNullable(Int64)\n"
@@ -2262,7 +2302,7 @@ def test_union_schema_inference_mode(started_cluster):
)
error = instance.query_and_get_error(
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3,4}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{file_name_prefix}{{1,2,3,4}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
)
assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error