ClickHouse/tests/queries/0_stateless/02735_parquet_encoder.sql
2023-08-21 14:15:52 -07:00

183 lines
10 KiB
SQL

-- Tags: no-fasttest, no-parallel
set output_format_parquet_use_custom_encoder = 1;
set output_format_parquet_row_group_size = 1000;
set output_format_parquet_data_page_size = 800;
set output_format_parquet_batch_size = 100;
set output_format_parquet_row_group_size_bytes = 1000000000;
set engine_file_truncate_on_insert=1;
-- Write random data to parquet file, then read from it and check that it matches what we wrote.
-- Do this for all kinds of data types: primitive, Nullable(primitive), Array(primitive),
-- Array(Nullable(primitive)), Array(Array(primitive)), Map(primitive, primitive), etc.
drop table if exists basic_types_02735;
create temporary table basic_types_02735 as select * from generateRandom('
u8 UInt8,
u16 UInt16,
u32 UInt32,
u64 UInt64,
i8 Int8,
i16 Int16,
i32 Int32,
i64 Int64,
date Date,
date32 Date32,
datetime DateTime,
datetime64 DateTime64,
enum8 Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3),
enum16 Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000),
float32 Float32,
float64 Float64,
str String,
fstr FixedString(12),
u128 UInt128,
u256 UInt256,
i128 Int128,
i256 Int256,
decimal32 Decimal32(3),
decimal64 Decimal64(10),
decimal128 Decimal128(20),
decimal256 Decimal256(40),
ipv4 IPv4,
ipv6 IPv6') limit 10101;
insert into function file(basic_types_02735.parquet) select * from basic_types_02735;
desc file(basic_types_02735.parquet);
select (select sum(cityHash64(*)) from basic_types_02735) - (select sum(cityHash64(*)) from file(basic_types_02735.parquet));
drop table basic_types_02735;
drop table if exists nullables_02735;
create temporary table nullables_02735 as select * from generateRandom('
u16 Nullable(UInt16),
i64 Nullable(Int64),
datetime64 Nullable(DateTime64),
enum8 Nullable(Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3)),
float64 Nullable(Float64),
str Nullable(String),
fstr Nullable(FixedString(12)),
i256 Nullable(Int256),
decimal256 Nullable(Decimal256(40)),
ipv6 Nullable(IPv6)') limit 10000;
insert into function file(nullables_02735.parquet) select * from nullables_02735;
select (select sum(cityHash64(*)) from nullables_02735) - (select sum(cityHash64(*)) from file(nullables_02735.parquet));
drop table nullables_02735;
-- TODO: When cityHash64() fully supports Nullable: https://github.com/ClickHouse/ClickHouse/pull/48625
-- the next two blocks can be simplified: arrays_out_02735 intermediate table is not needed,
-- a.csv and b.csv are not needed.
drop table if exists arrays_02735;
drop table if exists arrays_out_02735;
create table arrays_02735 engine = Memory as select * from generateRandom('
u32 Array(UInt32),
i8 Array(Int8),
datetime Array(DateTime),
enum16 Array(Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000)),
float32 Array(Float32),
str Array(String),
fstr Array(FixedString(12)),
u128 Array(UInt128),
decimal64 Array(Decimal64(10)),
ipv4 Array(IPv4),
msi Map(String, Int16),
tup Tuple(FixedString(3), Array(String), Map(Int8, Date))') limit 10000;
insert into function file(arrays_02735.parquet) select * from arrays_02735;
create temporary table arrays_out_02735 as arrays_02735;
insert into arrays_out_02735 select * from file(arrays_02735.parquet);
select (select sum(cityHash64(*)) from arrays_02735) - (select sum(cityHash64(*)) from arrays_out_02735);
--select (select sum(cityHash64(*)) from arrays_02735) -
-- (select sum(cityHash64(u32, i8, datetime, enum16, float32, str, fstr, arrayMap(x->reinterpret(x, 'UInt128'), u128), decimal64, ipv4, msi, tup)) from file(arrays_02735.parquet));
drop table arrays_02735;
drop table arrays_out_02735;
drop table if exists madness_02735;
create temporary table madness_02735 as select * from generateRandom('
aa Array(Array(UInt32)),
aaa Array(Array(Array(UInt32))),
an Array(Nullable(String)),
aan Array(Array(Nullable(FixedString(10)))),
l LowCardinality(String),
ln LowCardinality(Nullable(FixedString(11))),
al Array(LowCardinality(UInt128)),
aaln Array(Array(LowCardinality(Nullable(String)))),
mln Map(LowCardinality(String), Nullable(Int8)),
t Tuple(Map(FixedString(5), Tuple(Array(UInt16), Nullable(UInt16), Array(Tuple(Int8, Decimal64(10))))), Tuple(kitchen UInt64, sink String)),
n Nested(hello UInt64, world Tuple(first String, second FixedString(1)))
') limit 10000;
insert into function file(madness_02735.parquet) select * from madness_02735;
insert into function file(a.csv) select * from madness_02735 order by tuple(*);
insert into function file(b.csv) select aa, aaa, an, aan, l, ln, arrayMap(x->reinterpret(x, 'UInt128'), al) as al_, aaln, mln, t, n.hello, n.world from file(madness_02735.parquet) order by tuple(aa, aaa, an, aan, l, ln, al_, aaln, mln, t, n.hello, n.world);
select (select sum(cityHash64(*)) from file(a.csv, LineAsString)) - (select sum(cityHash64(*)) from file(b.csv, LineAsString));
--select (select sum(cityHash64(*)) from madness_02735) -
-- (select sum(cityHash64(aa, aaa, an, aan, l, ln, map(x->reinterpret(x, 'UInt128'), al), aaln, mln, t, n.hello, n.world)) from file(madness_02735.parquet));
drop table madness_02735;
-- Merging input blocks into bigger row groups.
insert into function file(squash_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1;
select num_columns, num_rows, num_row_groups from file(squash_02735.parquet, ParquetMetadata);
-- Row group size limit in bytes.
insert into function file(row_group_bytes_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1, output_format_parquet_row_group_size_bytes = 5;
select num_columns, num_rows, num_row_groups from file(row_group_bytes_02735.parquet, ParquetMetadata);
-- Row group size limit in rows.
insert into function file(tiny_row_groups_02735.parquet) select * from numbers(3) settings output_format_parquet_row_group_size = 1;
select num_columns, num_rows, num_row_groups from file(tiny_row_groups_02735.parquet, ParquetMetadata);
-- 1M unique 8-byte values should exceed dictionary_size_limit (1 MB).
insert into function file(big_column_chunk_02735.parquet) select number from numbers(1000000) settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(big_column_chunk_02735.parquet, ParquetMetadata);
select sum(cityHash64(number)) from file(big_column_chunk_02735.parquet);
-- Check statistics: signed vs unsigned, null count. Use enough rows to produce multiple pages.
insert into function file(statistics_02735.parquet) select 100 + number%200 as a, toUInt32(number * 3000) as u, toInt32(number * 3000) as i, if(number % 10 == 9, toString(number), null) as s from numbers(1000000) settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(statistics_02735.parquet, ParquetMetadata);
select tupleElement(c, 'statistics') from file(statistics_02735.parquet, ParquetMetadata) array join tupleElement(row_groups[1], 'columns') as c;
-- Statistics string length limit (max_statistics_size).
insert into function file(long_string_02735.parquet) select toString(range(number * 2000)) from numbers(2);
select tupleElement(tupleElement(row_groups[1], 'columns'), 'statistics') from file(long_string_02735.parquet, ParquetMetadata);
-- Compression setting.
insert into function file(compressed_02735.parquet) select concat('aaaaaaaaaaaaaaaa', toString(number)) as s from numbers(1000) settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='zstd';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);
insert into function file(compressed_02735.parquet) select concat('aaaaaaaaaaaaaaaa', toString(number)) as s from numbers(1000) settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='none';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);
insert into function file(compressed_02735.parquet) select if(number%3==1, NULL, 42) as x from numbers(70) settings output_format_parquet_compression_method='zstd';
select sum(cityHash64(*)) from file(compressed_02735.parquet);
-- Single-threaded encoding and Arrow encoder.
drop table if exists other_encoders_02735;
create temporary table other_encoders_02735 as select number, number*2 from numbers(10000);
insert into function file(single_thread_02735.parquet) select * from other_encoders_02735 settings max_threads = 1;
select sum(cityHash64(*)) from file(single_thread_02735.parquet);
insert into function file(arrow_02735.parquet) select * from other_encoders_02735 settings output_format_parquet_use_custom_encoder = 0;
select sum(cityHash64(*)) from file(arrow_02735.parquet);
-- String -> binary vs string; FixedString -> fixed-length-binary vs binary vs string.
insert into function file(strings1_02735.parquet) select 'never', toFixedString('gonna', 5) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 1;
select columns.5, columns.6 from file(strings1_02735.parquet, ParquetMetadata) array join columns;
insert into function file(strings2_02735.parquet) select 'give', toFixedString('you', 3) settings output_format_parquet_string_as_string = 0, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings2_02735.parquet, ParquetMetadata) array join columns;
insert into function file(strings3_02735.parquet) select toFixedString('up', 2) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings3_02735.parquet, ParquetMetadata) array join columns;
select * from file(strings1_02735.parquet);
select * from file(strings2_02735.parquet);
select * from file(strings3_02735.parquet);
-- DateTime64 with different units.
insert into function file(datetime64_02735.parquet) select
toDateTime64(number / 1e3, 3) as ms,
toDateTime64(number / 1e6, 6) as us,
toDateTime64(number / 1e9, 9) as ns,
toDateTime64(number / 1e2, 2) as cs,
toDateTime64(number, 0) as s,
toDateTime64(number / 1e7, 7) as dus
from numbers(2000);
desc file(datetime64_02735.parquet);
select sum(cityHash64(*)) from file(datetime64_02735.parquet);