Fix deprecated HDFS image and harden test_storage_hdfs.

Yatsishin Ilya 2024-07-24 19:55:03 +00:00
parent d296e62bf3
commit 9c28c64adf
2 changed files with 87 additions and 49 deletions

View File (docker-compose definition for the hdfs1 service)

@@ -1,7 +1,7 @@
 version: '2.3'
 services:
   hdfs1:
-    image: sequenceiq/hadoop-docker:2.7.0
+    image: prasanthj/docker-hadoop:2.6.0
     hostname: hdfs1
     restart: always
     expose:
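
Note: the old sequenceiq image is deprecated, so the compose file pins a maintained single-node replacement. When swapping base images like this, a cheap readiness probe against the NameNode RPC port the tests target (hdfs1:9000) catches a broken image before any query runs. This is an illustrative sketch, not part of the commit; host, port, and timeout are assumptions:

    import socket
    import time

    def wait_for_hdfs(host="hdfs1", port=9000, timeout=60):
        # Poll until the NameNode accepts TCP connections or the deadline passes.
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                with socket.create_connection((host, port), timeout=2):
                    return
            except OSError:
                time.sleep(1)
        raise TimeoutError(f"HDFS at {host}:{port} did not come up")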

View File (test_storage_hdfs test suite)

@@ -1,6 +1,7 @@
 import os
 import pytest
+import uuid
 import time
 from helpers.cluster import ClickHouseCluster, is_arm
 from helpers.test_tools import TSV
@@ -31,13 +32,15 @@ def started_cluster():
 def test_read_write_storage(started_cluster):
+    id = uuid.uuid4()
     hdfs_api = started_cluster.hdfs_api
+    filename = f"simple_storage_{id}"
     node1.query("drop table if exists SimpleHDFSStorage SYNC")
     node1.query(
-        "create table SimpleHDFSStorage (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/simple_storage', 'TSV')"
+        f"create table SimpleHDFSStorage (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/{filename}', 'TSV')"
     )
     node1.query("insert into SimpleHDFSStorage values (1, 'Mark', 72.53)")
-    assert hdfs_api.read_data("/simple_storage") == "1\tMark\t72.53\n"
+    assert hdfs_api.read_data(f"/{filename}") == "1\tMark\t72.53\n"
     assert node1.query("select * from SimpleHDFSStorage") == "1\tMark\t72.53\n"
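
Note: the hardening pattern in this hunk is to derive a unique HDFS path per test run, so a retried or concurrently running test never collides with leftovers from an earlier one. A hypothetical distillation of the pattern (helper name is mine, not the commit's):

    import uuid

    def unique_hdfs_path(prefix: str) -> str:
        # e.g. unique_hdfs_path("simple_storage") -> "/simple_storage_<random-uuid>"
        return f"/{prefix}_{uuid.uuid4()}"
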
@@ -92,6 +95,10 @@ def test_read_write_storage_with_globs(started_cluster):
         print(ex)
         assert "in readonly mode" in str(ex)
+    node1.query("drop table HDFSStorageWithRange")
+    node1.query("drop table HDFSStorageWithEnum")
+    node1.query("drop table HDFSStorageWithQuestionMark")
+    node1.query("drop table HDFSStorageWithAsterisk")


 def test_storage_with_multidirectory_glob(started_cluster):
     hdfs_api = started_cluster.hdfs_api
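
Note: the commit drops tables inline at the end of each test body. An equivalent, failure-proof variant would be a pytest fixture that guarantees the DROP even when an assertion fires first. A sketch, assuming the module-level node1 this file uses throughout:

    import uuid
    import pytest

    @pytest.fixture
    def scratch_table():
        name = f"t_{uuid.uuid4().hex}"
        yield name
        # Teardown runs even if the test body raised.
        node1.query(f"drop table if exists {name} SYNC")
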
@@ -137,7 +144,6 @@ def test_read_write_table(started_cluster):

 def test_write_table(started_cluster):
     hdfs_api = started_cluster.hdfs_api
-
     node1.query(
         "create table OtherHDFSStorage (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/other_storage', 'TSV')"
     )
@@ -148,7 +154,8 @@ def test_write_table(started_cluster):
     result = "10\ttomas\t55.55\n11\tjack\t32.54\n"
     assert hdfs_api.read_data("/other_storage") == result
     assert node1.query("select * from OtherHDFSStorage order by id") == result
+    node1.query("truncate table OtherHDFSStorage")
+    node1.query("drop table OtherHDFSStorage")


 def test_bad_hdfs_uri(started_cluster):
     try:
@@ -166,6 +173,7 @@ def test_bad_hdfs_uri(started_cluster):
         print(ex)
         assert "Unable to connect to HDFS" in str(ex)
+    node1.query("drop table BadStorage2")

     try:
         node1.query(
             "create table BadStorage3 (id UInt32, name String, weight Float64) ENGINE = HDFS('hdfs://hdfs1:9000/<>', 'TSV')"
@@ -173,6 +181,7 @@ def test_bad_hdfs_uri(started_cluster):
     except Exception as ex:
         print(ex)
         assert "Unable to open HDFS file" in str(ex)
+    node1.query("drop table BadStorage3")


 @pytest.mark.timeout(800)
@@ -304,7 +313,8 @@ def test_write_gz_storage(started_cluster):
     node1.query("insert into GZHDFSStorage values (1, 'Mark', 72.53)")
     assert hdfs_api.read_gzip_data("/storage.gz") == "1\tMark\t72.53\n"
     assert node1.query("select * from GZHDFSStorage") == "1\tMark\t72.53\n"
+    node1.query("truncate table GZHDFSStorage")
+    node1.query("drop table GZHDFSStorage")


 def test_write_gzip_storage(started_cluster):
     hdfs_api = started_cluster.hdfs_api
@@ -315,6 +325,8 @@ def test_write_gzip_storage(started_cluster):
     node1.query("insert into GZIPHDFSStorage values (1, 'Mark', 72.53)")
     assert hdfs_api.read_gzip_data("/gzip_storage") == "1\tMark\t72.53\n"
     assert node1.query("select * from GZIPHDFSStorage") == "1\tMark\t72.53\n"
+    node1.query("truncate table GZIPHDFSStorage")
+    node1.query("drop table GZIPHDFSStorage")


 def test_virtual_columns(started_cluster):
@@ -333,7 +345,7 @@ def test_virtual_columns(started_cluster):
         )
         == expected
     )
+    node1.query("drop table virtual_cols")


 def test_read_files_with_spaces(started_cluster):
     hdfs_api = started_cluster.hdfs_api
@@ -354,6 +366,7 @@ def test_read_files_with_spaces(started_cluster):
     )
     assert node1.query("select * from test order by id") == "1\n2\n3\n"
     fs.delete(dir, recursive=True)
+    node1.query(f"drop table test")


 def test_truncate_table(started_cluster):
@@ -375,47 +388,52 @@ def test_partition_by(started_cluster):
 def test_partition_by(started_cluster):
-    hdfs_api = started_cluster.hdfs_api
+    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
+    id = uuid.uuid4()
     table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
+    dir = f"partition_{id}"
+    fs.mkdirs(f"/{dir}/", permission=777)
     file_name = "test_{_partition_id}"
     partition_by = "column3"
     values = "(1, 2, 3), (3, 2, 1), (1, 3, 2)"
-    table_function = f"hdfs('hdfs://hdfs1:9000/{file_name}', 'TSV', '{table_format}')"
+    table_function = f"hdfs('hdfs://hdfs1:9000/{dir}/{file_name}', 'TSV', '{table_format}')"

     node1.query(
         f"insert into table function {table_function} PARTITION BY {partition_by} values {values}"
     )

     result = node1.query(
-        f"select * from hdfs('hdfs://hdfs1:9000/test_1', 'TSV', '{table_format}')"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test_1', 'TSV', '{table_format}')"
     )
     assert result.strip() == "3\t2\t1"
     result = node1.query(
-        f"select * from hdfs('hdfs://hdfs1:9000/test_2', 'TSV', '{table_format}')"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test_2', 'TSV', '{table_format}')"
     )
     assert result.strip() == "1\t3\t2"
     result = node1.query(
-        f"select * from hdfs('hdfs://hdfs1:9000/test_3', 'TSV', '{table_format}')"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test_3', 'TSV', '{table_format}')"
     )
     assert result.strip() == "1\t2\t3"

     file_name = "test2_{_partition_id}"
     node1.query(
-        f"create table p(column1 UInt32, column2 UInt32, column3 UInt32) engine = HDFS('hdfs://hdfs1:9000/{file_name}', 'TSV') partition by column3"
+        f"create table p(column1 UInt32, column2 UInt32, column3 UInt32) engine = HDFS('hdfs://hdfs1:9000/{dir}/{file_name}', 'TSV') partition by column3"
     )
     node1.query(f"insert into p values {values}")
     result = node1.query(
-        f"select * from hdfs('hdfs://hdfs1:9000/test2_1', 'TSV', '{table_format}')"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test2_1', 'TSV', '{table_format}')"
     )
     assert result.strip() == "3\t2\t1"
     result = node1.query(
-        f"select * from hdfs('hdfs://hdfs1:9000/test2_2', 'TSV', '{table_format}')"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test2_2', 'TSV', '{table_format}')"
     )
     assert result.strip() == "1\t3\t2"
     result = node1.query(
-        f"select * from hdfs('hdfs://hdfs1:9000/test2_3', 'TSV', '{table_format}')"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test2_3', 'TSV', '{table_format}')"
     )
     assert result.strip() == "1\t2\t3"
+    node1.query(f"drop table p")
+    fs.delete(f"/{dir}", recursive=True)


 def test_seekable_formats(started_cluster):
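
Note: tests that now write under a fresh directory follow a mkdir-then-teardown shape via pyhdfs (the WebHDFS client these tests appear to use; only the HdfsClient calls are visible in the diff). A distilled sketch, with the helper name my own:

    import uuid
    from pyhdfs import HdfsClient  # assumed import, matching the fs usage above

    def make_scratch_dir(hdfs_ip: str) -> str:
        # Create a per-test directory; the caller removes it afterwards with
        # fs.delete(path, recursive=True), as test_partition_by does.
        fs = HdfsClient(hosts=hdfs_ip)
        path = f"/partition_{uuid.uuid4()}"
        fs.mkdirs(f"{path}/", permission=777)
        return path
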
@@ -425,7 +443,7 @@ def test_seekable_formats(started_cluster):
         f"hdfs('hdfs://hdfs1:9000/parquet', 'Parquet', 'a Int32, b String')"
     )
     node1.query(
-        f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)"
+        f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) SETTINGS hdfs_truncate_on_insert=1"
     )

     result = node1.query(f"SELECT count() FROM {table_function}")
@@ -433,7 +451,7 @@ def test_seekable_formats(started_cluster):
     table_function = f"hdfs('hdfs://hdfs1:9000/orc', 'ORC', 'a Int32, b String')"
     node1.query(
-        f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000)"
+        f"insert into table function {table_function} SELECT number, randomString(100) FROM numbers(5000000) SETTINGS hdfs_truncate_on_insert=1"
     )
     result = node1.query(f"SELECT count() FROM {table_function}")
     assert int(result) == 5000000
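
Note: the SETTINGS clause added in these hunks is what makes the inserts idempotent. Without it, ClickHouse refuses to insert into an HDFS file that already exists, so any rerun of the test fails on the second attempt. Illustrative usage (paths as above; a minimal sketch, not the full test):

    table_function = "hdfs('hdfs://hdfs1:9000/parquet', 'Parquet', 'a Int32, b String')"
    # The first insert creates the file; the setting lets a rerun overwrite it
    # instead of erroring out. hdfs_create_new_file_on_insert=1 is the
    # alternative that writes a numbered sibling file instead.
    node1.query(
        f"insert into table function {table_function} "
        "select 1, 'x' settings hdfs_truncate_on_insert=1"
    )
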
@@ -457,7 +475,7 @@ def test_read_table_with_default(started_cluster):

 def test_schema_inference(started_cluster):
     node1.query(
-        f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000)"
+        f"insert into table function hdfs('hdfs://hdfs1:9000/native', 'Native', 'a Int32, b String') SELECT number, randomString(100) FROM numbers(5000000) SETTINGS hdfs_truncate_on_insert=1"
     )

     result = node1.query(f"desc hdfs('hdfs://hdfs1:9000/native', 'Native')")
@@ -476,7 +494,7 @@ def test_schema_inference(started_cluster):
     result = node1.query(f"select count(*) from schema_inference")
     assert int(result) == 5000000
+    node1.query(f"drop table schema_inference")


 def test_hdfsCluster(started_cluster):
     hdfs_api = started_cluster.hdfs_api
@@ -510,6 +528,7 @@ def test_hdfs_directory_not_exist(started_cluster):
     assert "" == node1.query(
         "select * from HDFSStorageWithNotExistDir settings hdfs_ignore_file_doesnt_exist=1"
     )
+    node1.query("drop table HDFSStorageWithNotExistDir")


 def test_overwrite(started_cluster):
@@ -529,12 +548,16 @@ def test_overwrite(started_cluster):
     result = node1.query(f"select count() from test_overwrite")
     assert int(result) == 10
+    node1.query(f"truncate table test_overwrite")
+    node1.query(f"drop table test_overwrite")


 def test_multiple_inserts(started_cluster):
-    hdfs_api = started_cluster.hdfs_api
+    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
+    id = uuid.uuid4()
+    fs.mkdirs(f"/{id}/", permission=777)

-    table_function = f"hdfs('hdfs://hdfs1:9000/data_multiple_inserts', 'Parquet', 'a Int32, b String')"
+    table_function = f"hdfs('hdfs://hdfs1:9000/{id}/data_multiple_inserts', 'Parquet', 'a Int32, b String')"
     node1.query(f"create table test_multiple_inserts as {table_function}")
     node1.query(
         f"insert into test_multiple_inserts select number, randomString(100) from numbers(10)"
@@ -551,7 +574,7 @@ def test_multiple_inserts(started_cluster):
     result = node1.query(f"drop table test_multiple_inserts")

-    table_function = f"hdfs('hdfs://hdfs1:9000/data_multiple_inserts.gz', 'Parquet', 'a Int32, b String')"
+    table_function = f"hdfs('hdfs://hdfs1:9000/{id}/data_multiple_inserts.gz', 'Parquet', 'a Int32, b String')"
     node1.query(f"create table test_multiple_inserts as {table_function}")
     node1.query(
         f"insert into test_multiple_inserts select number, randomString(100) FROM numbers(10)"
@@ -565,7 +588,7 @@ def test_multiple_inserts(started_cluster):
     result = node1.query(f"select count() from test_multiple_inserts")
     assert int(result) == 60
+    node1.query(f"drop table test_multiple_inserts")


 def test_format_detection(started_cluster):
     node1.query(
@@ -574,6 +597,8 @@ def test_format_detection(started_cluster):
     node1.query(f"insert into arrow_table select 1")
     result = node1.query(f"select * from hdfs('hdfs://hdfs1:9000/data.arrow')")
     assert int(result) == 1
+    node1.query(f"truncate table arrow_table")
+    node1.query(f"drop table arrow_table")


 def test_schema_inference_with_globs(started_cluster):
@@ -618,6 +643,8 @@ def test_schema_inference_with_globs(started_cluster):

 def test_insert_select_schema_inference(started_cluster):
+    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
+
     node1.query(
         f"insert into table function hdfs('hdfs://hdfs1:9000/test.native.zst') select toUInt64(1) as x"
     )
@@ -627,6 +654,7 @@ def test_insert_select_schema_inference(started_cluster):
     result = node1.query(f"select * from hdfs('hdfs://hdfs1:9000/test.native.zst')")
     assert int(result) == 1
+    fs.delete("/test.native.zst")


 def test_cluster_join(started_cluster):
@@ -967,11 +995,11 @@ def test_read_subcolumns(started_cluster):
     node = started_cluster.instances["node1"]

     node.query(
-        f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)"
+        f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.tsv', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) settings hdfs_truncate_on_insert=1"
     )

     node.query(
-        f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3)"
+        f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumns.jsonl', auto, 'a Tuple(b Tuple(c UInt32, d UInt32), e UInt32)') select ((1, 2), 3) settings hdfs_truncate_on_insert=1"
     )

     res = node.query(
@@ -1003,7 +1031,7 @@ def test_read_subcolumn_time(started_cluster):
     node = started_cluster.instances["node1"]

     node.query(
-        f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a UInt32') select (42)"
+        f"insert into function hdfs('hdfs://hdfs1:9000/test_subcolumn_time.tsv', auto, 'a UInt32') select (42) settings hdfs_truncate_on_insert=1"
     )

     res = node.query(
@@ -1014,91 +1042,101 @@ def test_read_subcolumn_time(started_cluster):
 def test_union_schema_inference_mode(started_cluster):
+    id = uuid.uuid4()
+    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
+    dir = f"union_{id}"
+    fs.mkdirs(f"/{dir}/", permission=777)
     node = started_cluster.instances["node1"]

     node.query(
-        "insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference1.jsonl') select 1 as a"
+        f"insert into function hdfs('hdfs://hdfs1:9000/{dir}/test_union_schema_inference1.jsonl') select 1 as a"
     )

     node.query(
-        "insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference2.jsonl') select 2 as b"
+        f"insert into function hdfs('hdfs://hdfs1:9000/{dir}/test_union_schema_inference2.jsonl') select 2 as b"
     )

     node.query("system drop schema cache for hdfs")

     result = node.query(
-        "desc hdfs('hdfs://hdfs1:9000/test_union_schema_inference*.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
+        f"desc hdfs('hdfs://hdfs1:9000/{dir}/test_union_schema_inference*.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
     )
     assert result == "a\tNullable(Int64)\nb\tNullable(Int64)\n"

     result = node.query(
-        "select schema_inference_mode, splitByChar('/', source)[-1] as file, schema from system.schema_inference_cache where source like '%test_union_schema_inference%' order by file format TSV"
+        f"select schema_inference_mode, splitByChar('/', source)[-1] as file, schema from system.schema_inference_cache where source like '%test_union_schema_inference%' order by file format TSV"
     )
     assert (
         result == "UNION\ttest_union_schema_inference1.jsonl\ta Nullable(Int64)\n"
         "UNION\ttest_union_schema_inference2.jsonl\tb Nullable(Int64)\n"
     )
     result = node.query(
-        "select * from hdfs('hdfs://hdfs1:9000/test_union_schema_inference*.jsonl') order by tuple(*) settings schema_inference_mode='union', describe_compact_output=1 format TSV"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test_union_schema_inference*.jsonl') order by tuple(*) settings schema_inference_mode='union', describe_compact_output=1 format TSV"
     )
     assert result == "1\t\\N\n" "\\N\t2\n"

     node.query(f"system drop schema cache for hdfs")
     result = node.query(
-        "desc hdfs('hdfs://hdfs1:9000/test_union_schema_inference2.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
+        f"desc hdfs('hdfs://hdfs1:9000/{dir}/test_union_schema_inference2.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
     )
     assert result == "b\tNullable(Int64)\n"

     result = node.query(
-        "desc hdfs('hdfs://hdfs1:9000/test_union_schema_inference*.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
+        f"desc hdfs('hdfs://hdfs1:9000/{dir}/test_union_schema_inference*.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
     )
     assert result == "a\tNullable(Int64)\n" "b\tNullable(Int64)\n"

     node.query(
-        f"insert into function hdfs('hdfs://hdfs1:9000/test_union_schema_inference3.jsonl', TSV) select 'Error'"
+        f"insert into function hdfs('hdfs://hdfs1:9000/{dir}/test_union_schema_inference3.jsonl', TSV) select 'Error'"
     )

     error = node.query_and_get_error(
-        "desc hdfs('hdfs://hdfs1:9000/test_union_schema_inference*.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
+        f"desc hdfs('hdfs://hdfs1:9000/{dir}/test_union_schema_inference*.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV"
     )
     assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error


 def test_format_detection(started_cluster):
     node = started_cluster.instances["node1"]
+    fs = HdfsClient(hosts=started_cluster.hdfs_ip)
+    id = uuid.uuid4()
+    dir = f"{id}"
+    fs.mkdirs(f"/{dir}/", permission=777)
     node.query(
-        "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection0', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(0)"
+        f"insert into function hdfs('hdfs://hdfs1:9000/{dir}/test_format_detection0', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(0)"
     )

     node.query(
-        "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(10)"
+        f"insert into function hdfs('hdfs://hdfs1:9000/{dir}/test_format_detection1', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(10)"
     )

     expected_desc_result = node.query(
-        "desc hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow)"
+        f"desc hdfs('hdfs://hdfs1:9000/{dir}/test_format_detection1', JSONEachRow)"
     )

-    desc_result = node.query("desc hdfs('hdfs://hdfs1:9000/test_format_detection1')")
+    desc_result = node.query(f"desc hdfs('hdfs://hdfs1:9000/{dir}/test_format_detection1')")

     assert expected_desc_result == desc_result

     expected_result = node.query(
-        "select * from hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow, 'x UInt64, y String') order by x, y"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test_format_detection1', JSONEachRow, 'x UInt64, y String') order by x, y"
     )

     result = node.query(
-        "select * from hdfs('hdfs://hdfs1:9000/test_format_detection1') order by x, y"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test_format_detection1') order by x, y"
     )

     assert expected_result == result

     result = node.query(
-        "select * from hdfs('hdfs://hdfs1:9000/test_format_detection1', auto, 'x UInt64, y String') order by x, y"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test_format_detection1', auto, 'x UInt64, y String') order by x, y"
     )

     assert expected_result == result

     result = node.query(
-        "select * from hdfs('hdfs://hdfs1:9000/test_format_detection{0,1}') order by x, y"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test_format_detection{{0,1}}') order by x, y"
     )

     assert expected_result == result
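
Note: schema_inference_mode='union', exercised in test_union_schema_inference_mode above, merges the column sets of all matched files instead of requiring every file to share one schema; columns absent from a file come back Nullable. A condensed, hypothetical repro (file names u1/u2 are mine), assuming the same node handle as the tests:

    node.query("insert into function hdfs('hdfs://hdfs1:9000/u1.jsonl') select 1 as a")
    node.query("insert into function hdfs('hdfs://hdfs1:9000/u2.jsonl') select 2 as b")
    # Union of {a} and {b}: both columns appear, each Nullable(Int64).
    print(node.query(
        "desc hdfs('hdfs://hdfs1:9000/u{1,2}.jsonl') "
        "settings schema_inference_mode='union', describe_compact_output=1"
    ))
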
@@ -1106,25 +1144,25 @@ def test_format_detection(started_cluster):
     node.query("system drop schema cache for hdfs")

     result = node.query(
-        "select * from hdfs('hdfs://hdfs1:9000/test_format_detection{0,1}') order by x, y"
+        f"select * from hdfs('hdfs://hdfs1:9000/{dir}/test_format_detection{{0,1}}') order by x, y"
     )

     assert expected_result == result

     result = node.query(
-        "select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/test_format_detection{0,1}') order by x, y"
+        f"select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/{dir}/test_format_detection{{0,1}}') order by x, y"
     )

     assert expected_result == result

     result = node.query(
-        "select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/test_format_detection{0,1}', auto, auto) order by x, y"
+        f"select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/{dir}/test_format_detection{{0,1}}', auto, auto) order by x, y"
     )

     assert expected_result == result

     result = node.query(
-        "select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/test_format_detection{0,1}', auto, 'x UInt64, y String') order by x, y"
+        f"select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/{dir}/test_format_detection{{0,1}}', auto, 'x UInt64, y String') order by x, y"
     )

     assert expected_result == result
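
Note: once these URLs became f-strings, the glob alternation had to be written with doubled braces so it survives interpolation; only {dir} is substituted. A minimal check of the Python behavior:

    dir = "some_uuid"
    url = f"hdfs://hdfs1:9000/{dir}/test_format_detection{{0,1}}"
    # The doubled braces come out as a literal {0,1}, which ClickHouse then
    # expands as a path alternation (file ...0 or ...1).
    assert url == "hdfs://hdfs1:9000/some_uuid/test_format_detection{0,1}"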