import logging
import os
import time
import pytest

from helpers.cluster import ClickHouseCluster
from helpers.test_tools import TSV

logging.getLogger().setLevel(logging.INFO)
logging.getLogger().addHandler(logging.StreamHandler())

SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))


@pytest.fixture(scope="module")
def started_cluster():
    try:
        cluster = ClickHouseCluster(__file__)
        cluster.add_instance('h0_0_0', main_configs=['configs/config.xml'], extra_configs=['configs/hdfs-site.xml'], with_hive=True)
        logging.info("Starting cluster ...")
        cluster.start()
        yield cluster
    finally:
        cluster.shutdown()


def test_create_parquet_table(started_cluster):
    logging.info('Start testing creating hive table ...')
    node = started_cluster.instances['h0_0_0']
    node.query("set input_format_parquet_allow_missing_columns = true")
    result = node.query("""
    DROP TABLE IF EXISTS default.demo_parquet;
    CREATE TABLE default.demo_parquet (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY (day);
    CREATE TABLE default.demo_parquet_parts (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String), `hour` String) ENGINE = Hive('thrift://hivetest:9083', 'test', 'parquet_demo') PARTITION BY (day, hour);
    """)
    logging.info("create result {}".format(result))
    time.sleep(120)
    assert result.strip() == ''
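

# Illustrative sketch, not part of the original test: the fixed time.sleep(120)
# above could in principle be replaced by a small polling helper that retries a
# condition until it holds or a timeout elapses. The helper name and the example
# condition are assumptions, not ClickHouse test-framework API.
def wait_until(condition, timeout=120, interval=5):
    # Poll `condition` every `interval` seconds; return True as soon as it
    # holds, or False once `timeout` seconds have elapsed.
    deadline = time.time() + timeout
    while time.time() < deadline:
        if condition():
            return True
        time.sleep(interval)
    return False
# Hypothetical usage: wait_until(lambda: node.query("EXISTS TABLE default.demo_parquet").strip() == '1')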


def test_create_orc_table(started_cluster):
    logging.info('Start testing creating hive table ...')
    node = started_cluster.instances['h0_0_0']
    result = node.query("""
    DROP TABLE IF EXISTS default.demo_orc;
    CREATE TABLE default.demo_orc (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_orc') PARTITION BY (day)
    """)
    logging.info("create result {}".format(result))
    assert result.strip() == ''


def test_create_text_table(started_cluster):
    logging.info('Start testing creating hive table ...')
    node = started_cluster.instances['h0_0_0']
    result = node.query("""
    DROP TABLE IF EXISTS default.demo_text;
    CREATE TABLE default.demo_text (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo_text') PARTITION BY (tuple())
    """)
    logging.info("create result {}".format(result))
    assert result.strip() == ''


def test_parquet_groupby(started_cluster):
    logging.info('Start testing groupby ...')
    node = started_cluster.instances['h0_0_0']
    result = node.query("""
    SELECT day, count(*) FROM default.demo_parquet group by day order by day
    """)
    expected_result = """2021-11-01\t1
2021-11-05\t2
2021-11-11\t1
2021-11-16\t2
"""
    assert result == expected_result
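

# Illustrative sketch, not part of the original tests: helpers.test_tools.TSV
# (imported above but otherwise unused) is the usual way ClickHouse integration
# tests compare tab-separated query output, avoiding brittle literal-whitespace
# comparisons. The helper below mirrors test_parquet_groupby and assumes the
# same demo data; it is an example, not an additional test case.
def _example_groupby_with_tsv(node):
    result = node.query("SELECT day, count(*) FROM default.demo_parquet group by day order by day")
    expected = TSV("""2021-11-01\t1
2021-11-05\t2
2021-11-11\t1
2021-11-16\t2""")
    assert TSV(result) == expected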


def test_parquet_in_filter(started_cluster):
    logging.info('Start testing parquet in filter ...')
    node = started_cluster.instances['h0_0_0']
    result = node.query("""
    SELECT count(*) FROM default.demo_parquet_parts where day = '2021-11-05' and hour in ('00')
    """)
    expected_result = """2
"""
    assert result == expected_result


def test_orc_groupby(started_cluster):
    logging.info('Start testing groupby ...')
    node = started_cluster.instances['h0_0_0']
    result = node.query("""
    SELECT day, count(*) FROM default.demo_orc group by day order by day
    """)
    expected_result = """2021-11-01\t1
2021-11-05\t2
2021-11-11\t1
2021-11-16\t2
"""
    assert result == expected_result


def test_text_count(started_cluster):
    node = started_cluster.instances['h0_0_0']
    # format_csv_delimiter is set to \x01, Hive's default field delimiter for text tables
    result = node.query("""
    SELECT day, count(*) FROM default.demo_orc group by day order by day SETTINGS format_csv_delimiter = '\x01'
    """)
    expected_result = """2021-11-01\t1
2021-11-05\t2
2021-11-11\t1
2021-11-16\t2
"""
    assert result == expected_result


def test_parquet_groupby_with_cache(started_cluster):
    logging.info('Start testing groupby ...')
    node = started_cluster.instances['h0_0_0']
    result = node.query("""
    SELECT day, count(*) FROM default.demo_parquet group by day order by day
    """)
    expected_result = """2021-11-01\t1
2021-11-05\t2
2021-11-11\t1
2021-11-16\t2
"""
    assert result == expected_result


def test_cache_read_bytes(started_cluster):
    node = started_cluster.instances['h0_0_0']
    node.query("set input_format_parquet_allow_missing_columns = true")
    result = node.query("""
    DROP TABLE IF EXISTS default.demo_parquet;
    CREATE TABLE default.demo_parquet (`id` Nullable(String), `score` Nullable(Int32), `day` Nullable(String)) ENGINE = Hive('thrift://hivetest:9083', 'test', 'demo') PARTITION BY (day)
    """)
    result = node.query("""
    SELECT day, count(*) FROM default.demo_parquet group by day order by day
    """)
    result = node.query("""
    SELECT day, count(*) FROM default.demo_parquet group by day order by day
    """)
    expected_result = """2021-11-01\t1
2021-11-05\t2
2021-11-11\t1
2021-11-16\t2
"""
    # wait for the profile events to be flushed into system.metric_log
    time.sleep(120)
    assert result == expected_result
    result = node.query("select sum(ProfileEvent_ExternalDataSourceLocalCacheReadBytes) from system.metric_log where ProfileEvent_ExternalDataSourceLocalCacheReadBytes > 0")
    logging.info("Read bytes from cache: {}".format(result))
    assert result.strip() != '0'
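

# Illustrative sketch, not part of the original test: the same cache activity
# could also be read from system.events, which exposes cumulative profile-event
# counters without waiting for the periodic metric_log flush. The helper name is
# an assumption; the event name matches the ProfileEvent_ column used above.
def _example_cache_read_bytes_via_events(node):
    result = node.query("select value from system.events where event = 'ExternalDataSourceLocalCacheReadBytes'")
    logging.info("Read bytes from cache (system.events): {}".format(result))
    # Returns 0 if the event has not been recorded yet (empty result set).
    return int(result.strip() or 0)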