import csv
import logging
import os
import shutil
import time
from email.errors import HeaderParseError
import pytest
from helpers.cluster import ClickHouseCluster
from helpers.mock_servers import start_mock_servers
from helpers.test_tools import TSV
logging.getLogger().setLevel(logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

S3_DATA = [
    "data/clickhouse/part1.csv",
    "data/clickhouse/part123.csv",
    "data/database/part2.csv",
    "data/database/partition675.csv",
]
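
# Helper: uploads the static fixture files plus 100 generated CSVs
# (columns: a String, b UInt64) into the MinIO bucket used by the tests.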
def create_buckets_s3(cluster):
    minio = cluster.minio_client

    for file_number in range(100):
        file_name = f"data/generated/file_{file_number}.csv"
        os.makedirs(os.path.join(SCRIPT_DIR, "data/generated/"), exist_ok=True)
        S3_DATA.append(file_name)
        with open(os.path.join(SCRIPT_DIR, file_name), "w+", encoding="utf-8") as f:
            # a String, b UInt64
            data = []

            # Make all files a bit different
            for number in range(100 + file_number):
                data.append(
                    ["str_" + str(number + file_number) * 10, number + file_number]
                )

            writer = csv.writer(f)
            writer.writerows(data)

    for file in S3_DATA:
        minio.fput_object(
            bucket_name=cluster.minio_bucket,
            object_name=file,
            file_path=os.path.join(SCRIPT_DIR, file),
        )
    for obj in minio.list_objects(cluster.minio_bucket, recursive=True):
        print(obj.object_name)
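
# Helper: starts the local HTTP mock ("resolver" on port 8080) that the header
# tests query; those tests expect it to answer with the value of the
# MyCustomHeader request header.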
def run_s3_mocks(started_cluster):
    script_dir = os.path.join(os.path.dirname(__file__), "s3_mocks")
    start_mock_servers(
        started_cluster,
        script_dir,
        [
            ("s3_mock.py", "resolver", "8080"),
        ],
    )
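
# Module-scoped cluster: two replicas of shard1 (s0_0_0, s0_0_1) and one replica
# of shard2 (s0_1_0). The cluster names used in the queries below
# ('cluster_simple', 'first_shard', 'cluster_non_existent_port') are assumed to
# be defined in configs/cluster.xml.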
@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance(
            "s0_0_0",
            main_configs=["configs/cluster.xml", "configs/named_collections.xml"],
            user_configs=["configs/users.xml"],
            macros={"replica": "node1", "shard": "shard1"},
            with_minio=True,
            with_zookeeper=True,
        )
        cluster.add_instance(
            "s0_0_1",
            main_configs=["configs/cluster.xml", "configs/named_collections.xml"],
            user_configs=["configs/users.xml"],
            macros={"replica": "replica2", "shard": "shard1"},
            with_zookeeper=True,
        )
        cluster.add_instance(
            "s0_1_0",
            main_configs=["configs/cluster.xml", "configs/named_collections.xml"],
            user_configs=["configs/users.xml"],
            macros={"replica": "replica1", "shard": "shard2"},
            with_zookeeper=True,
        )

        logging.info("Starting cluster...")
        cluster.start()
        logging.info("Cluster started")

        create_buckets_s3(cluster)

        run_s3_mocks(cluster)

        yield cluster
    finally:
        shutil.rmtree(os.path.join(SCRIPT_DIR, "data/generated/"))
        cluster.shutdown()
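
# s3Cluster over 'cluster_simple' must return the same rows as plain s3 over the same glob.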
def test_select_all(started_cluster):
    node = started_cluster.instances["s0_0_0"]
    pure_s3 = node.query(
        """
    SELECT * from s3(
        'http://minio1:9001/root/data/{clickhouse,database}/*',
        'minio', 'minio123', 'CSV',
        'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
    ORDER BY (name, value, polygon)"""
    )
    # print(pure_s3)
    s3_distributed = node.query(
        """
    SELECT * from s3Cluster(
        'cluster_simple',
        'http://minio1:9001/root/data/{clickhouse,database}/*', 'minio', 'minio123', 'CSV',
        'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') ORDER BY (name, value, polygon)"""
    )
    # print(s3_distributed)

    assert TSV(pure_s3) == TSV(s3_distributed)
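
# count(*) must match between s3 and s3Cluster.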
def test_count(started_cluster):
    node = started_cluster.instances["s0_0_0"]
    pure_s3 = node.query(
        """
    SELECT count(*) from s3(
        'http://minio1:9001/root/data/{clickhouse,database}/*',
        'minio', 'minio123', 'CSV',
        'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')"""
    )
    # print(pure_s3)
    s3_distributed = node.query(
        """
    SELECT count(*) from s3Cluster(
        'cluster_simple', 'http://minio1:9001/root/data/{clickhouse,database}/*',
        'minio', 'minio123', 'CSV',
        'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')"""
    )
    # print(s3_distributed)

    assert TSV(pure_s3) == TSV(s3_distributed)
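
# The cluster name can be supplied via a macro ('{default_cluster_macro}') instead of a literal.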
def test_count_macro(started_cluster):
    node = started_cluster.instances["s0_0_0"]
    s3_macro = node.query(
        """
    SELECT count(*) from s3Cluster(
        '{default_cluster_macro}', 'http://minio1:9001/root/data/{clickhouse,database}/*',
        'minio', 'minio123', 'CSV',
        'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')"""
    )
    # print(s3_macro)
    s3_distributed = node.query(
        """
    SELECT count(*) from s3Cluster(
        'cluster_simple', 'http://minio1:9001/root/data/{clickhouse,database}/*',
        'minio', 'minio123', 'CSV',
        'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')"""
    )
    # print(s3_distributed)

    assert TSV(s3_macro) == TSV(s3_distributed)
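
# UNION ALL of two s3 reads must match UNION ALL of two s3Cluster reads.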
def test_union_all(started_cluster):
    node = started_cluster.instances["s0_0_0"]
    pure_s3 = node.query(
        """
    SELECT * FROM
    (
        SELECT * from s3(
            'http://minio1:9001/root/data/{clickhouse,database}/*',
            'minio', 'minio123', 'CSV',
            'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
        UNION ALL
        SELECT * from s3(
            'http://minio1:9001/root/data/{clickhouse,database}/*',
            'minio', 'minio123', 'CSV',
            'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
    )
    ORDER BY (name, value, polygon)
    """
    )
    # print(pure_s3)
    s3_distributed = node.query(
        """
    SELECT * FROM
    (
        SELECT * from s3Cluster(
            'cluster_simple',
            'http://minio1:9001/root/data/{clickhouse,database}/*', 'minio', 'minio123', 'CSV',
            'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
        UNION ALL
        SELECT * from s3Cluster(
            'cluster_simple',
            'http://minio1:9001/root/data/{clickhouse,database}/*', 'minio', 'minio123', 'CSV',
            'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
    )
    ORDER BY (name, value, polygon)
    """
    )
    # print(s3_distributed)

    assert TSV(pure_s3) == TSV(s3_distributed)
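
# Referencing a cluster that is not configured must fail with a "not found" error.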
def test_wrong_cluster(started_cluster):
    node = started_cluster.instances["s0_0_0"]
    error = node.query_and_get_error(
        """
    SELECT count(*) from s3Cluster(
        'non_existent_cluster',
        'http://minio1:9001/root/data/{clickhouse,database}/*',
        'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
    UNION ALL
    SELECT count(*) from s3Cluster(
        'non_existent_cluster',
        'http://minio1:9001/root/data/{clickhouse,database}/*',
        'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
    """
    )

    assert "not found" in error
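
# A self-join of two s3Cluster reads must not trigger AMBIGUOUS_COLUMN_NAME.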
def test_ambiguous_join(started_cluster):
    node = started_cluster.instances["s0_0_0"]
    result = node.query(
        """
    SELECT l.name, r.value from s3Cluster(
        'cluster_simple',
        'http://minio1:9001/root/data/{clickhouse,database}/*', 'minio', 'minio123', 'CSV',
        'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') as l
    JOIN s3Cluster(
        'cluster_simple',
        'http://minio1:9001/root/data/{clickhouse,database}/*', 'minio', 'minio123', 'CSV',
        'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))') as r
    ON l.name = r.name
    """
    )
    assert "AMBIGUOUS_COLUMN_NAME" not in result
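
# With skip_unavailable_shards = 1 the unreachable shard is ignored and
# part1.csv (10 rows) is still counted.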
def test_skip_unavailable_shards(started_cluster):
    node = started_cluster.instances["s0_0_0"]
    result = node.query(
        """
    SELECT count(*) from s3Cluster(
        'cluster_non_existent_port',
        'http://minio1:9001/root/data/clickhouse/part1.csv',
        'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
    SETTINGS skip_unavailable_shards = 1
    """
    )
    assert result == "10\n"
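
# Same query, but relying on the default behaviour instead of an explicit setting.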
def test_unset_skip_unavailable_shards(started_cluster):
    # Although skip_unavailable_shards is not set, cluster table functions should always skip unavailable shards.
    node = started_cluster.instances["s0_0_0"]
    result = node.query(
        """
    SELECT count(*) from s3Cluster(
        'cluster_non_existent_port',
        'http://minio1:9001/root/data/clickhouse/part1.csv',
        'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
    """
    )

    assert result == "10\n"
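
# With parallel_distributed_insert_select = 1 the INSERT ... SELECT FROM s3Cluster
# should also run on the second replica (verified via system.query_log there).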
def test_distributed_insert_select_with_replicated(started_cluster):
    first_replica_first_shard = started_cluster.instances["s0_0_0"]
    second_replica_first_shard = started_cluster.instances["s0_0_1"]

    first_replica_first_shard.query(
        """DROP TABLE IF EXISTS insert_select_replicated_local ON CLUSTER 'first_shard' SYNC;"""
    )

    first_replica_first_shard.query(
        """
    CREATE TABLE insert_select_replicated_local ON CLUSTER 'first_shard' (a String, b UInt64)
    ENGINE=ReplicatedMergeTree('/clickhouse/tables/{shard}/insert_select_with_replicated', '{replica}')
    ORDER BY (a, b);
        """
    )

    for replica in [first_replica_first_shard, second_replica_first_shard]:
        replica.query(
            """
            SYSTEM STOP FETCHES;
            """
        )
        replica.query(
            """
            SYSTEM STOP MERGES;
            """
        )

    first_replica_first_shard.query(
        """
    INSERT INTO insert_select_replicated_local SELECT * FROM s3Cluster(
        'first_shard',
        'http://minio1:9001/root/data/generated/*.csv', 'minio', 'minio123', 'CSV', 'a String, b UInt64'
    ) SETTINGS parallel_distributed_insert_select=1;
        """
    )

    for replica in [first_replica_first_shard, second_replica_first_shard]:
        replica.query(
            """
            SYSTEM FLUSH LOGS;
            """
        )

    assert (
        int(
            second_replica_first_shard.query(
                """SELECT count(*) FROM system.query_log WHERE not is_initial_query and query ilike '%s3Cluster%';"""
            ).strip()
        )
        != 0
    )

    # Check whether we inserted at least something
    assert (
        int(
            second_replica_first_shard.query(
                """SELECT count(*) FROM insert_select_replicated_local;"""
            ).strip()
        )
        != 0
    )

    first_replica_first_shard.query(
        """DROP TABLE IF EXISTS insert_select_replicated_local ON CLUSTER 'first_shard' SYNC;"""
    )
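
# Same parallel INSERT ... SELECT, but the s3Cluster structure is inferred instead of
# taken from the insertion table (use_structure_from_insertion_table_in_table_functions = 0).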
def test_parallel_distributed_insert_select_with_schema_inference(started_cluster):
    node = started_cluster.instances["s0_0_0"]

    node.query(
        """DROP TABLE IF EXISTS parallel_insert_select ON CLUSTER 'first_shard' SYNC;"""
    )

    node.query(
        """
    CREATE TABLE parallel_insert_select ON CLUSTER 'first_shard' (a String, b UInt64)
    ENGINE=ReplicatedMergeTree('/clickhouse/tables/{shard}/insert_select_with_replicated', '{replica}')
    ORDER BY (a, b);
        """
    )

    node.query(
        """
    INSERT INTO parallel_insert_select SELECT * FROM s3Cluster(
        'first_shard',
        'http://minio1:9001/root/data/generated/*.csv', 'minio', 'minio123', 'CSV'
    ) SETTINGS parallel_distributed_insert_select=1, use_structure_from_insertion_table_in_table_functions=0;
        """
    )

    node.query("SYSTEM SYNC REPLICA parallel_insert_select")

    actual_count = int(
        node.query(
            "SELECT count() FROM s3('http://minio1:9001/root/data/generated/*.csv', 'minio', 'minio123', 'CSV', 'a String, b UInt64')"
        )
    )

    count = int(node.query("SELECT count() FROM parallel_insert_select"))
    assert count == actual_count
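
# The headers(...) clause must be forwarded by both s3 and s3Cluster; the mock
# resolver replies with the MyCustomHeader value.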
def test_cluster_with_header(started_cluster):
    node = started_cluster.instances["s0_0_0"]
    assert (
        node.query(
            "SELECT * from s3('http://resolver:8080/bucket/key.csv', headers(MyCustomHeader = 'SomeValue'))"
        )
        == "SomeValue\n"
    )
    assert (
        node.query(
            "SELECT * from s3('http://resolver:8080/bucket/key.csv', headers(MyCustomHeader = 'SomeValue'), 'CSV')"
        )
        == "SomeValue\n"
    )
    assert (
        node.query(
            "SELECT * from s3Cluster('cluster_simple', 'http://resolver:8080/bucket/key.csv', headers(MyCustomHeader = 'SomeValue'))"
        )
        == "SomeValue\n"
    )
    assert (
        node.query(
            "SELECT * from s3Cluster('cluster_simple', 'http://resolver:8080/bucket/key.csv', headers(MyCustomHeader = 'SomeValue'), 'CSV')"
        )
        == "SomeValue\n"
    )
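
# s3Cluster accepts a named collection (test_s3, presumably defined in
# configs/named_collections.xml) instead of explicit URL and credentials.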
def test_cluster_with_named_collection(started_cluster):
    node = started_cluster.instances["s0_0_0"]

    pure_s3 = node.query("""SELECT * from s3(test_s3) ORDER BY (c1, c2, c3)""")

    s3_cluster = node.query(
        """SELECT * from s3Cluster(cluster_simple, test_s3) ORDER BY (c1, c2, c3)"""
    )
    assert TSV(pure_s3) == TSV(s3_cluster)

    s3_cluster = node.query(
        """SELECT * from s3Cluster(cluster_simple, test_s3, structure='auto') ORDER BY (c1, c2, c3)"""
    )
    assert TSV(pure_s3) == TSV(s3_cluster)
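
# When the format argument is omitted, s3Cluster should detect the format (and schema)
# the same way the plain s3 function does.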
def test_cluster_format_detection(started_cluster):
    node = started_cluster.instances["s0_0_0"]

    expected_desc_result = node.query(
        "desc s3('http://minio1:9001/root/data/generated/*', 'minio', 'minio123', 'CSV')"
    )
    desc_result = node.query(
        "desc s3('http://minio1:9001/root/data/generated/*', 'minio', 'minio123')"
    )
    assert expected_desc_result == desc_result

    expected_result = node.query(
        "SELECT * FROM s3('http://minio1:9001/root/data/generated/*', 'minio', 'minio123', 'CSV', 'a String, b UInt64') order by a, b"
    )

    result = node.query(
        "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/generated/*', 'minio', 'minio123') order by c1, c2"
    )
    assert result == expected_result

    result = node.query(
        "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/generated/*', 'minio', 'minio123', auto, 'a String, b UInt64') order by a, b"
    )
    assert result == expected_result
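
# Columns with a DEFAULT expression (date Date DEFAULT 18262) must be filled identically
# by s3 and s3Cluster, including with 'auto' format/compression arguments and with the
# test_s3_with_default named collection.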
def test_cluster_default_expression(started_cluster):
    node = started_cluster.instances["s0_0_0"]

    node.query(
        "insert into function s3('http://minio1:9001/root/data/data1', 'minio', 'minio123', JSONEachRow) select 1 as id settings s3_truncate_on_insert=1"
    )
    node.query(
        "insert into function s3('http://minio1:9001/root/data/data2', 'minio', 'minio123', JSONEachRow) select * from numbers(0) settings s3_truncate_on_insert=1"
    )
    node.query(
        "insert into function s3('http://minio1:9001/root/data/data3', 'minio', 'minio123', JSONEachRow) select 2 as id settings s3_truncate_on_insert=1"
    )

    expected_result = node.query(
        "SELECT * FROM s3('http://minio1:9001/root/data/data{1,2,3}', 'minio', 'minio123', 'JSONEachRow', 'id UInt32, date Date DEFAULT 18262') order by id"
    )

    result = node.query(
        "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/data{1,2,3}', 'minio', 'minio123', 'JSONEachRow', 'id UInt32, date Date DEFAULT 18262') order by id"
    )
    assert result == expected_result

    result = node.query(
        "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/data{1,2,3}', 'minio', 'minio123', 'auto', 'id UInt32, date Date DEFAULT 18262') order by id"
    )
    assert result == expected_result

    result = node.query(
        "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/data{1,2,3}', 'minio', 'minio123', 'JSONEachRow', 'id UInt32, date Date DEFAULT 18262', 'auto') order by id"
    )
    assert result == expected_result

    result = node.query(
        "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/data{1,2,3}', 'minio', 'minio123', 'auto', 'id UInt32, date Date DEFAULT 18262', 'auto') order by id"
    )
    assert result == expected_result

    result = node.query(
        "SELECT * FROM s3Cluster(cluster_simple, test_s3_with_default) order by id"
    )
    assert result == expected_result