mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-30 11:32:03 +00:00
183 lines
10 KiB
SQL
183 lines
10 KiB
SQL
-- Tags: no-fasttest, no-parallel

-- Session-wide settings for the statements below. Individual inserts override some of them
-- through their own `settings` clause (row group size, row group bytes, custom encoder).
set output_format_parquet_use_custom_encoder = 1;
set output_format_parquet_row_group_size = 1000;
set output_format_parquet_data_page_size = 800;
set output_format_parquet_batch_size = 100;
set output_format_parquet_row_group_size_bytes = 1000000000;
-- compressed_02735.parquet is inserted into three times further down; going by the setting
-- name, this makes each insert replace the file contents (NOTE(review): confirm in the docs).
set engine_file_truncate_on_insert=1;

-- Write random data to parquet file, then read from it and check that it matches what we wrote.
-- Do this for all kinds of data types: primitive, Nullable(primitive), Array(primitive),
-- Array(Nullable(primitive)), Array(Array(primitive)), Map(primitive, primitive), etc.

-- Non-nullable scalar types: generate 10101 random rows, write them to a Parquet file,
-- and compare what is read back with what was written.
drop table if exists basic_types_02735;
create temporary table basic_types_02735 as select * from generateRandom('
    u8 UInt8,
    u16 UInt16,
    u32 UInt32,
    u64 UInt64,
    i8 Int8,
    i16 Int16,
    i32 Int32,
    i64 Int64,
    date Date,
    date32 Date32,
    datetime DateTime,
    datetime64 DateTime64,
    enum8 Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3),
    enum16 Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000),
    float32 Float32,
    float64 Float64,
    str String,
    fstr FixedString(12),
    u128 UInt128,
    u256 UInt256,
    i128 Int128,
    i256 Int256,
    decimal32 Decimal32(3),
    decimal64 Decimal64(10),
    decimal128 Decimal128(20),
    decimal256 Decimal256(40),
    ipv4 IPv4,
    ipv6 IPv6') limit 10101;
insert into function file(basic_types_02735.parquet) select * from basic_types_02735;
-- Print the column names and types reported for the file that was just written.
desc file(basic_types_02735.parquet);
-- Hash sum of the written rows minus hash sum of the re-read rows; identical data gives 0.
select (select sum(cityHash64(*)) from basic_types_02735) - (select sum(cityHash64(*)) from file(basic_types_02735.parquet));
drop table basic_types_02735;

-- Nullable(primitive) columns: same write / read back / compare-hash-sums check on 10000 rows.
drop table if exists nullables_02735;
create temporary table nullables_02735 as select * from generateRandom('
    u16 Nullable(UInt16),
    i64 Nullable(Int64),
    datetime64 Nullable(DateTime64),
    enum8 Nullable(Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3)),
    float64 Nullable(Float64),
    str Nullable(String),
    fstr Nullable(FixedString(12)),
    i256 Nullable(Int256),
    decimal256 Nullable(Decimal256(40)),
    ipv6 Nullable(IPv6)') limit 10000;
insert into function file(nullables_02735.parquet) select * from nullables_02735;
-- Hash sum of the written rows minus hash sum of the re-read rows; identical data gives 0.
select (select sum(cityHash64(*)) from nullables_02735) - (select sum(cityHash64(*)) from file(nullables_02735.parquet));
drop table nullables_02735;

-- TODO: When cityHash64() fully supports Nullable: https://github.com/ClickHouse/ClickHouse/pull/48625
--       the next two blocks can be simplified: arrays_out_02735 intermediate table is not needed,
--       a.csv and b.csv are not needed.

-- Array, Map and Tuple columns. The file is read back into arrays_out_02735, which is created
-- with the structure of arrays_02735, and the two tables are compared by hash sum.
drop table if exists arrays_02735;
drop table if exists arrays_out_02735;
create table arrays_02735 engine = Memory as select * from generateRandom('
    u32 Array(UInt32),
    i8 Array(Int8),
    datetime Array(DateTime),
    enum16 Array(Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000)),
    float32 Array(Float32),
    str Array(String),
    fstr Array(FixedString(12)),
    u128 Array(UInt128),
    decimal64 Array(Decimal64(10)),
    ipv4 Array(IPv4),
    msi Map(String, Int16),
    tup Tuple(FixedString(3), Array(String), Map(Int8, Date))') limit 10000;
insert into function file(arrays_02735.parquet) select * from arrays_02735;
create temporary table arrays_out_02735 as arrays_02735;
insert into arrays_out_02735 select * from file(arrays_02735.parquet);
-- Hash sum of the written rows minus hash sum of the re-read rows; identical data gives 0.
select (select sum(cityHash64(*)) from arrays_02735) - (select sum(cityHash64(*)) from arrays_out_02735);
-- Direct comparison against the file, kept for the TODO above (note the explicit reinterpret of u128):
--select (select sum(cityHash64(*)) from arrays_02735) -
--       (select sum(cityHash64(u32, i8, datetime, enum16, float32, str, fstr, arrayMap(x->reinterpret(x, 'UInt128'), u128), decimal64, ipv4, msi, tup)) from file(arrays_02735.parquet));
drop table arrays_02735;
drop table arrays_out_02735;

-- Deeply nested and LowCardinality / Nullable combinations. The written table and the re-read
-- file are each dumped to CSV in a deterministic order (order by the whole row), and the two
-- dumps are compared by hashing them line by line.
drop table if exists madness_02735;
create temporary table madness_02735 as select * from generateRandom('
    aa Array(Array(UInt32)),
    aaa Array(Array(Array(UInt32))),
    an Array(Nullable(String)),
    aan Array(Array(Nullable(FixedString(10)))),
    l LowCardinality(String),
    ln LowCardinality(Nullable(FixedString(11))),
    al Array(LowCardinality(UInt128)),
    aaln Array(Array(LowCardinality(Nullable(String)))),
    mln Map(LowCardinality(String), Nullable(Int8)),
    t Tuple(Map(FixedString(5), Tuple(Array(UInt16), Nullable(UInt16), Array(Tuple(Int8, Decimal64(10))))), Tuple(kitchen UInt64, sink String)),
    n Nested(hello UInt64, world Tuple(first String, second FixedString(1)))
    ') limit 10000;
insert into function file(madness_02735.parquet) select * from madness_02735;
-- a.csv: the rows as written.
insert into function file(a.csv) select * from madness_02735 order by tuple(*);
-- b.csv: the rows as read back, with the elements of `al` reinterpreted as UInt128 before dumping.
insert into function file(b.csv)
    select aa, aaa, an, aan, l, ln, arrayMap(x->reinterpret(x, 'UInt128'), al) as al_, aaln, mln, t, n.hello, n.world
    from file(madness_02735.parquet)
    order by tuple(aa, aaa, an, aan, l, ln, al_, aaln, mln, t, n.hello, n.world);
-- Hash sum of a.csv lines minus hash sum of b.csv lines; identical dumps give 0.
select (select sum(cityHash64(*)) from file(a.csv, LineAsString)) - (select sum(cityHash64(*)) from file(b.csv, LineAsString));
-- Direct comparison against the file, kept for when cityHash64() fully supports Nullable
-- (https://github.com/ClickHouse/ClickHouse/pull/48625):
--select (select sum(cityHash64(*)) from madness_02735) -
--       (select sum(cityHash64(aa, aaa, an, aan, l, ln, arrayMap(x->reinterpret(x, 'UInt128'), al), aaln, mln, t, n.hello, n.world)) from file(madness_02735.parquet));
drop table madness_02735;

-- Merging input blocks into bigger row groups.
-- Two one-value selects are combined with max_block_size = 1; the metadata query then reports
-- how many row groups the file ended up with.
insert into function file(squash_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1;
select num_columns, num_rows, num_row_groups from file(squash_02735.parquet, ParquetMetadata);

-- Row group size limit in bytes.
-- Same two 6-character strings as the previous check, but with the byte limit lowered to 5,
-- which is smaller than a single value.
insert into function file(row_group_bytes_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1, output_format_parquet_row_group_size_bytes = 5;
select num_columns, num_rows, num_row_groups from file(row_group_bytes_02735.parquet, ParquetMetadata);

-- Row group size limit in rows.
-- Three rows written with a row limit of 1 per row group.
insert into function file(tiny_row_groups_02735.parquet) select * from numbers(3) settings output_format_parquet_row_group_size = 1;
select num_columns, num_rows, num_row_groups from file(tiny_row_groups_02735.parquet, ParquetMetadata);

-- 1M unique 8-byte values should exceed dictionary_size_limit (1 MB).
-- The row group size is raised to 1000000 for this insert so that all rows may go into one
-- row group; the second select checks the values still read back correctly.
insert into function file(big_column_chunk_02735.parquet) select number from numbers(1000000) settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(big_column_chunk_02735.parquet, ParquetMetadata);
select sum(cityHash64(number)) from file(big_column_chunk_02735.parquet);

-- Check statistics: signed vs unsigned, null count. Use enough rows to produce multiple pages.
--   a: cycles through 100..299.
--   u, i: number * 3000 reaches 2999997000, which fits UInt32 but is above the Int32 maximum,
--         so the same source values are stored once as unsigned and once as signed.
--   s: non-NULL only when number % 10 = 9, i.e. NULL in nine rows out of ten.
insert into function file(statistics_02735.parquet)
    select
        100 + number%200 as a,
        toUInt32(number * 3000) as u,
        toInt32(number * 3000) as i,
        if(number % 10 == 9, toString(number), null) as s
    from numbers(1000000)
    settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(statistics_02735.parquet, ParquetMetadata);
-- One output row per column of the first row group, showing that column's statistics.
select tupleElement(c, 'statistics') from file(statistics_02735.parquet, ParquetMetadata) array join tupleElement(row_groups[1], 'columns') as c;

-- Statistics string length limit (max_statistics_size).
-- Two rows: toString(range(0)) = '[]' and toString(range(2000)), a string several thousand
-- characters long. The select prints the statistics recorded for the first row group.
insert into function file(long_string_02735.parquet) select toString(range(number * 2000)) from numbers(2);
select tupleElement(tupleElement(row_groups[1], 'columns'), 'statistics') from file(long_string_02735.parquet, ParquetMetadata);

-- Compression setting.
-- The same file name is written three times in this section.

-- 1) zstd: 1000 strings that share a 16-character prefix.
insert into function file(compressed_02735.parquet)
    select concat('aaaaaaaaaaaaaaaa', toString(number)) as s
    from numbers(1000)
    settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='zstd';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);

-- 2) Same data and the same two size predicates, with compression turned off.
insert into function file(compressed_02735.parquet)
    select concat('aaaaaaaaaaaaaaaa', toString(number)) as s
    from numbers(1000)
    settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='none';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);

-- 3) zstd again on 70 rows that are NULL where number % 3 = 1 and 42 otherwise; the hash sum
--    checks the values read back.
insert into function file(compressed_02735.parquet) select if(number%3==1, NULL, 42) as x from numbers(70) settings output_format_parquet_compression_method='zstd';
select sum(cityHash64(*)) from file(compressed_02735.parquet);

-- Single-threaded encoding and Arrow encoder.
-- The same 10000-row table is written twice: once with max_threads = 1, once with the custom
-- encoder switched off for that insert. Each file is then hashed on read.
drop table if exists other_encoders_02735;
create temporary table other_encoders_02735 as select number, number*2 from numbers(10000);

insert into function file(single_thread_02735.parquet) select * from other_encoders_02735 settings max_threads = 1;
select sum(cityHash64(*)) from file(single_thread_02735.parquet);

insert into function file(arrow_02735.parquet) select * from other_encoders_02735 settings output_format_parquet_use_custom_encoder = 0;
select sum(cityHash64(*)) from file(arrow_02735.parquet);

-- String -> binary vs string; FixedString -> fixed-length-binary vs binary vs string.
-- Three files cover three combinations of output_format_parquet_string_as_string and
-- output_format_parquet_fixed_string_as_fixed_byte_array. For each file, elements 5 and 6 of
-- every column's metadata tuple are printed.
-- NOTE(review): elements 5 and 6 are presumably the physical and logical type - confirm
-- against the ParquetMetadata format documentation.
insert into function file(strings1_02735.parquet) select 'never', toFixedString('gonna', 5) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 1;
select columns.5, columns.6 from file(strings1_02735.parquet, ParquetMetadata) array join columns;

insert into function file(strings2_02735.parquet) select 'give', toFixedString('you', 3) settings output_format_parquet_string_as_string = 0, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings2_02735.parquet, ParquetMetadata) array join columns;

insert into function file(strings3_02735.parquet) select toFixedString('up', 2) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings3_02735.parquet, ParquetMetadata) array join columns;

-- Read the values back from each of the three files.
select * from file(strings1_02735.parquet);
select * from file(strings2_02735.parquet);
select * from file(strings3_02735.parquet);

-- DateTime64 with different units.
-- One column per DateTime64 scale: 3, 6, 9, 2, 0 and 7 decimal digits. Each column divides
-- `number` by 10^scale, so consecutive rows differ by one tick at that column's scale.
insert into function file(datetime64_02735.parquet) select
    toDateTime64(number / 1e3, 3) as ms,
    toDateTime64(number / 1e6, 6) as us,
    toDateTime64(number / 1e9, 9) as ns,
    toDateTime64(number / 1e2, 2) as cs,
    toDateTime64(number, 0) as s,
    toDateTime64(number / 1e7, 7) as dus
    from numbers(2000);
-- Print the column types reported for the written file, then hash the values read back.
desc file(datetime64_02735.parquet);
select sum(cityHash64(*)) from file(datetime64_02735.parquet);