# A slightly modified version of the scheherazade test.
# Checks globs such as `prefix{1,2}/file*.csv`, where there are more than
# 1000 files under each of prefix1, prefix2.
def test_s3_glob_many_objects_under_selection(started_cluster):
    """Check a brace + wildcard glob over two folders holding 1100 objects each.

    One single-row CSV object with values ``(1, 1, 1)`` is written to
    ``folder1/file<N>.csv`` and ``folder2/file<N>.csv`` for every ``N`` in
    ``range(1100)``. A single ``select`` over ``folder{1,2}/file*.csv`` must
    then report 2200 rows and a sum of 2200 for each of the three columns.

    Inserts go to ``minio_ip:MINIO_INTERNAL_PORT``; the final ``select`` goes
    through ``minio_redirect_host:minio_redirect_port``.

    Args:
        started_cluster: cluster fixture exposing ``minio_bucket``,
            ``instances``, ``minio_ip``, ``minio_redirect_host`` and
            ``minio_redirect_port``.

    Raises:
        AssertionError: if the aggregated result differs from the expected row.
        Exception: whatever ``run_query`` raises for a failed insert; worker
            failures are re-raised here instead of being lost in a thread.
    """
    # Function-scope import keeps this block self-contained.
    from concurrent.futures import ThreadPoolExecutor

    bucket = started_cluster.minio_bucket
    instance = started_cluster.instances["dummy"]  # type: ClickHouseInstance
    table_format = "column1 UInt32, column2 UInt32, column3 UInt32"
    values = "(1, 1, 1)"

    # NOTE(review): presumably 1100 is chosen to exceed a 1000-key object
    # listing page -- confirm against the storage implementation.
    files_per_folder = 1100
    folder_nums = (1, 2)

    def create_files(file_num):
        """Insert the one-row object ``file<file_num>.csv`` into every folder."""
        for folder_num in folder_nums:
            path = f"folder{folder_num}/file{file_num}.csv"
            query = "insert into table function s3('http://{}:{}/{}/{}', 'CSV', '{}') values {}".format(
                started_cluster.minio_ip,
                MINIO_INTERNAL_PORT,
                bucket,
                path,
                table_format,
                values,
            )
            run_query(instance, query)

    # A bounded pool replaces one OS thread per file (1100 threads at once).
    with ThreadPoolExecutor(max_workers=16) as pool:
        # Draining the iterator re-raises the first exception raised by a
        # worker, so a failed insert fails the test at its real cause.
        for _ in pool.map(create_files, range(files_per_folder)):
            pass

    query = "select count(), sum(column1), sum(column2), sum(column3) from s3('http://{}:{}/{}/folder{{1,2}}/file*.csv', 'CSV', '{}')".format(
        started_cluster.minio_redirect_host,
        started_cluster.minio_redirect_port,
        bucket,
        table_format,
    )

    # Every row is (1, 1, 1), so the row count and each column sum are equal.
    total_rows = files_per_folder * len(folder_nums)
    expected_row = "\t".join([str(total_rows)] * 4)  # "2200\t2200\t2200\t2200"
    assert run_query(instance, query).splitlines() == [expected_row]