2024-11-06 19:42:50 +00:00
|
|
|
import glob
|
|
|
|
import json
|
|
|
|
import logging
|
|
|
|
import os
|
2024-11-13 19:12:52 +00:00
|
|
|
import random
|
2024-11-06 19:42:50 +00:00
|
|
|
import time
|
|
|
|
import uuid
|
2024-11-13 19:12:52 +00:00
|
|
|
from datetime import datetime, timedelta
|
2024-11-06 20:05:08 +00:00
|
|
|
|
2024-11-13 19:12:52 +00:00
|
|
|
import pyarrow as pa
|
2024-11-06 19:42:50 +00:00
|
|
|
import pytest
|
2024-11-06 20:05:08 +00:00
|
|
|
import requests
|
2024-11-12 15:41:39 +00:00
|
|
|
import urllib3
|
2024-11-12 17:24:25 +00:00
|
|
|
from minio import Minio
|
2024-11-12 15:41:39 +00:00
|
|
|
from pyiceberg.catalog import load_catalog
|
2024-11-12 17:24:25 +00:00
|
|
|
from pyiceberg.partitioning import PartitionField, PartitionSpec
|
2024-11-12 15:41:39 +00:00
|
|
|
from pyiceberg.schema import Schema
|
2024-11-12 17:24:25 +00:00
|
|
|
from pyiceberg.table.sorting import SortField, SortOrder
|
|
|
|
from pyiceberg.transforms import DayTransform, IdentityTransform
|
2024-11-12 15:41:39 +00:00
|
|
|
from pyiceberg.types import (
|
|
|
|
DoubleType,
|
2024-11-12 17:24:25 +00:00
|
|
|
FloatType,
|
2024-11-12 15:41:39 +00:00
|
|
|
NestedField,
|
2024-11-12 17:24:25 +00:00
|
|
|
StringType,
|
2024-11-12 15:41:39 +00:00
|
|
|
StructType,
|
2024-11-12 17:24:25 +00:00
|
|
|
TimestampType,
|
2024-11-12 15:41:39 +00:00
|
|
|
)
|
2024-11-13 19:12:52 +00:00
|
|
|
|
2024-11-06 19:42:50 +00:00
|
|
|
from helpers.cluster import ClickHouseCluster, ClickHouseInstance, is_arm
|
2024-11-06 20:05:08 +00:00
|
|
|
from helpers.s3_tools import get_file_contents, list_s3_objects, prepare_s3_bucket
|
2024-11-12 17:24:25 +00:00
|
|
|
from helpers.test_tools import TSV, csv_compare
|
2024-11-06 19:42:50 +00:00
|
|
|
|
|
|
|
# REST catalog endpoint as seen from inside the docker network...
BASE_URL = "http://rest:8181/v1"
# ...and as seen from the test host.
BASE_URL_LOCAL = "http://localhost:8182/v1"
BASE_URL_LOCAL_RAW = "http://localhost:8182"

CATALOG_NAME = "demo"

# Schema shared by every table these tests create.
# NOTE: Iceberg field IDs must be unique across the whole schema, including
# nested struct fields; the nested `created_by` field previously reused
# field_id=4 (colliding with `ask`) and is now 6.
DEFAULT_SCHEMA = Schema(
    NestedField(
        field_id=1, name="datetime", field_type=TimestampType(), required=False
    ),
    NestedField(field_id=2, name="symbol", field_type=StringType(), required=False),
    NestedField(field_id=3, name="bid", field_type=DoubleType(), required=False),
    NestedField(field_id=4, name="ask", field_type=DoubleType(), required=False),
    NestedField(
        field_id=5,
        name="details",
        field_type=StructType(
            NestedField(
                field_id=6,
                name="created_by",
                field_type=StringType(),
                required=False,
            ),
        ),
        required=False,
    ),
)

# Expected SHOW CREATE TABLE output; `\\n` / `\\'` are literal backslash
# escapes because ClickHouse escapes them in the query result.
DEFAULT_CREATE_TABLE = "CREATE TABLE {}.`{}.{}`\\n(\\n    `datetime` Nullable(DateTime64(6)),\\n    `symbol` Nullable(String),\\n    `bid` Nullable(Float64),\\n    `ask` Nullable(Float64),\\n    `details` Tuple(created_by Nullable(String))\\n)\\nENGINE = Iceberg(\\'http://minio:9000/warehouse/data/\\', \\'minio\\', \\'[HIDDEN]\\')\n"

# Partition by day of the `datetime` column (source_id=1).
DEFAULT_PARTITION_SPEC = PartitionSpec(
    PartitionField(
        source_id=1, field_id=1000, transform=DayTransform(), name="datetime_day"
    )
)

# Sort by `symbol` (source_id=2).
DEFAULT_SORT_ORDER = SortOrder(SortField(source_id=2, transform=IdentityTransform()))
|
2024-11-06 19:42:50 +00:00
|
|
|
|
|
|
|
|
|
|
|
def list_namespaces():
    """Return the namespace listing from the REST catalog as parsed JSON.

    Raises ``Exception`` when the catalog answers with anything but HTTP 200.
    """
    response = requests.get(f"{BASE_URL_LOCAL}/namespaces")
    if response.status_code != 200:
        raise Exception(f"Failed to list namespaces: {response.status_code}")
    return response.json()
|
|
|
|
|
|
|
|
|
2024-11-13 18:58:04 +00:00
|
|
|
def load_catalog_impl(started_cluster):
    """Return a pyiceberg REST catalog client for the test environment.

    ``started_cluster`` is accepted to mirror the other helpers' call shape;
    the endpoints below are fixed by the docker-compose setup (REST catalog on
    host port 8182, MinIO on host port 9002).
    """
    return load_catalog(
        CATALOG_NAME,
        **{
            "uri": BASE_URL_LOCAL_RAW,
            "type": "rest",
            # plain string: the original used an f-string with no placeholders
            "s3.endpoint": "http://localhost:9002",
            "s3.access-key-id": "minio",
            "s3.secret-access-key": "minio123",
        },
    )
|
|
|
|
|
2024-11-06 19:42:50 +00:00
|
|
|
|
2024-11-12 17:17:10 +00:00
|
|
|
def create_table(
    catalog,
    namespace,
    table,
    schema=DEFAULT_SCHEMA,
    partition_spec=DEFAULT_PARTITION_SPEC,
    sort_order=DEFAULT_SORT_ORDER,
):
    """Create ``namespace.table`` in the Iceberg catalog and return the table.

    All tables share the same S3 location prefix; data files are still kept
    apart by the catalog via per-table metadata.
    """
    return catalog.create_table(
        identifier=f"{namespace}.{table}",
        schema=schema,
        # plain string: the original used an f-string with no placeholders
        location="s3://warehouse/data",
        partition_spec=partition_spec,
        sort_order=sort_order,
    )
|
|
|
|
|
|
|
|
|
2024-11-13 18:58:04 +00:00
|
|
|
def generate_record():
    """Return one synthetic row matching DEFAULT_SCHEMA.

    ``bid``/``ask`` are random floats (rounded to 2 decimals) in disjoint
    ranges; ``symbol`` and ``details`` are fixed values.
    """
    return {
        "datetime": datetime.now(),
        "symbol": "kek",  # was str("kek") — the str() call was redundant
        "bid": round(random.uniform(100, 200), 2),
        "ask": round(random.uniform(200, 300), 2),
        "details": {"created_by": "Alice Smith"},
    }
|
|
|
|
|
|
|
|
|
2024-11-12 17:17:10 +00:00
|
|
|
def create_clickhouse_iceberg_database(started_cluster, node, name):
    """(Re)create a ClickHouse database backed by the Iceberg REST catalog."""
    ddl = f"""
DROP DATABASE IF EXISTS {name};
CREATE DATABASE {name} ENGINE = Iceberg('{BASE_URL}', 'minio', 'minio123')
SETTINGS catalog_type = 'rest', storage_endpoint = 'http://minio:9000/'
"""
    node.query(ddl)
|
|
|
|
|
|
|
|
|
2024-11-13 18:58:04 +00:00
|
|
|
def print_objects():
    """Debug helper: print every object in the ``warehouse`` MinIO bucket."""
    minio_client = Minio(
        # plain string: the original used an f-string with no placeholders
        "localhost:9002",
        access_key="minio",
        secret_key="minio123",
        secure=False,
        http_client=urllib3.PoolManager(cert_reqs="CERT_NONE"),
    )

    # sorted() replaces the list(...) + .sort() pair from the original.
    names = sorted(
        obj.object_name
        for obj in minio_client.list_objects("warehouse", "", recursive=True)
    )
    for name in names:
        print(f"Found object: {name}")
|
|
|
|
|
|
|
|
|
2024-11-06 19:42:50 +00:00
|
|
|
@pytest.fixture(scope="module")
def started_cluster():
    """Module-scoped ClickHouse cluster with the Iceberg catalog containers.

    Yields the running cluster and always shuts it down on teardown.
    """
    # Construct the cluster BEFORE entering the try block: if the constructor
    # raised inside try, the finally clause would hit a NameError on
    # `cluster.shutdown()` and mask the real failure.
    cluster = ClickHouseCluster(__file__)
    try:
        cluster.add_instance(
            "node1",
            main_configs=[],
            user_configs=[],
            stay_alive=True,
            with_iceberg_catalog=True,
        )

        logging.info("Starting cluster...")
        cluster.start()

        # TODO: properly wait for container
        time.sleep(10)

        yield cluster

    finally:
        cluster.shutdown()
|
|
|
|
|
|
|
|
|
2024-11-13 18:58:04 +00:00
|
|
|
def test_list_tables(started_cluster):
    """Tables created via pyiceberg appear in system.tables and survive restart."""
    node = started_cluster.instances["node1"]

    # Unique root namespace per run so reruns don't collide.
    root_namespace = f"clickhouse_{uuid.uuid4()}"
    namespace_1 = f"{root_namespace}.testA.A"
    namespace_2 = f"{root_namespace}.testB.B"
    namespace_1_tables = ["tableA", "tableB"]
    namespace_2_tables = ["tableC", "tableD"]

    catalog = load_catalog_impl(started_cluster)

    for namespace in [namespace_1, namespace_2]:
        catalog.create_namespace(namespace)

    # The root namespace must be visible both through the raw REST API and
    # through the pyiceberg client.
    assert any(
        root_namespace == listed[0] for listed in list_namespaces()["namespaces"]
    )
    assert any(root_namespace == listed[0] for listed in catalog.list_namespaces())

    for namespace in [namespace_1, namespace_2]:
        assert len(catalog.list_tables(namespace)) == 0

    create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME)

    # Create all tables and build the expected newline-separated listing in
    # one pass (the original duplicated this loop per namespace).
    expected_names = []
    for namespace, tables in [
        (namespace_1, namespace_1_tables),
        (namespace_2, namespace_2_tables),
    ]:
        for table in tables:
            create_table(catalog, namespace, table)
            expected_names.append(f"{namespace}.{table}")
    tables_list = "\n".join(expected_names)

    def listed_tables():
        # ClickHouse's view of this run's tables, sorted to match expected_names.
        return node.query(
            f"SELECT name FROM system.tables WHERE database = '{CATALOG_NAME}' and name ILIKE '{root_namespace}%' ORDER BY name"
        ).strip()

    # The listing must be correct both before and after a server restart
    # (i.e. the catalog database re-attaches cleanly).
    assert tables_list == listed_tables()
    node.restart_clickhouse()
    assert tables_list == listed_tables()

    expected = DEFAULT_CREATE_TABLE.format(CATALOG_NAME, namespace_2, "tableC")
    assert expected == node.query(
        f"SHOW CREATE TABLE {CATALOG_NAME}.`{namespace_2}.tableC`"
    )
|
2024-11-12 15:41:39 +00:00
|
|
|
|
2024-11-06 19:42:50 +00:00
|
|
|
|
2024-11-13 18:58:04 +00:00
|
|
|
def test_many_namespaces(started_cluster):
    """Tables inside deeply nested namespaces all show up in system.tables."""
    node = started_cluster.instances["node1"]
    root_namespace_1 = f"A_{uuid.uuid4()}"
    root_namespace_2 = f"B_{uuid.uuid4()}"
    namespaces = [
        f"{root_namespace_1}",
        f"{root_namespace_1}.B.C",
        f"{root_namespace_1}.B.C.D",
        f"{root_namespace_1}.B.C.D.E",
        f"{root_namespace_2}",
        f"{root_namespace_2}.C",
        f"{root_namespace_2}.CC",
    ]
    tables = ["A", "B", "C"]
    catalog = load_catalog_impl(started_cluster)

    # Populate every namespace with the same set of tables.
    for namespace in namespaces:
        catalog.create_namespace(namespace)
        for table in tables:
            create_table(catalog, namespace, table)

    create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME)

    # Every created table must be resolvable by its fully-qualified name.
    for namespace in namespaces:
        for table in tables:
            table_name = f"{namespace}.{table}"
            count = node.query(
                f"SELECT count() FROM system.tables WHERE database = '{CATALOG_NAME}' and name = '{table_name}'"
            )
            assert int(count)
|
2024-11-13 18:58:04 +00:00
|
|
|
|
|
|
|
|
|
|
|
def test_select(started_cluster):
    """Rows appended via pyiceberg are readable through the ClickHouse catalog DB."""
    node = started_cluster.instances["node1"]

    test_ref = f"test_list_tables_{uuid.uuid4()}"
    table_name = f"{test_ref}_table"
    root_namespace = f"{test_ref}_namespace"

    namespace = f"{root_namespace}.A.B.C"
    namespaces_to_create = [
        root_namespace,
        f"{root_namespace}.A",
        f"{root_namespace}.A.B",
        f"{root_namespace}.A.B.C",
    ]

    catalog = load_catalog_impl(started_cluster)

    # Use a dedicated loop variable: the original reused `namespace` here,
    # clobbering the target defined above — it only worked because the loop's
    # last value happened to coincide with it.
    for ns in namespaces_to_create:
        catalog.create_namespace(ns)
        assert len(catalog.list_tables(ns)) == 0

    table = create_table(catalog, namespace, table_name)

    num_rows = 10
    data = [generate_record() for _ in range(num_rows)]
    df = pa.Table.from_pylist(data)
    table.append(df)

    create_clickhouse_iceberg_database(started_cluster, node, CATALOG_NAME)

    expected = DEFAULT_CREATE_TABLE.format(CATALOG_NAME, namespace, table_name)
    assert expected == node.query(
        f"SHOW CREATE TABLE {CATALOG_NAME}.`{namespace}.{table_name}`"
    )

    assert num_rows == int(
        node.query(f"SELECT count() FROM {CATALOG_NAME}.`{namespace}.{table_name}`")
    )
|