-- Tags: no-fasttest, no-parallel

set output_format_parquet_use_custom_encoder = 1;
set output_format_parquet_row_group_size = 1000;
set output_format_parquet_data_page_size = 800;
set output_format_parquet_batch_size = 100;
set output_format_parquet_row_group_size_bytes = 1000000000;
set engine_file_truncate_on_insert=1;

-- Write random data to parquet file, then read from it and check that it matches what we wrote.
-- Do this for all kinds of data types: primitive, Nullable(primitive), Array(primitive),
-- Array(Nullable(primitive)), Array(Array(primitive)), Map(primitive, primitive), etc.

drop table if exists basic_types_02735;
create temporary table basic_types_02735 as select * from generateRandom('
    u8 UInt8,
    u16 UInt16,
    u32 UInt32,
    u64 UInt64,
    i8 Int8,
    i16 Int16,
    i32 Int32,
    i64 Int64,
    date Date,
    date32 Date32,
    datetime DateTime,
    datetime64 DateTime64,
    enum8 Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3),
    enum16 Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000),
    float32 Float32,
    float64 Float64,
    str String,
    fstr FixedString(12),
    u128 UInt128,
    u256 UInt256,
    i128 Int128,
    i256 Int256,
    decimal32 Decimal32(3),
    decimal64 Decimal64(10),
    decimal128 Decimal128(20),
    decimal256 Decimal256(40),
    ipv4 IPv4,
    ipv6 IPv6') limit 10101;
insert into function file(basic_types_02735.parquet) select * from basic_types_02735;
desc file(basic_types_02735.parquet);
select (select sum(cityHash64(*)) from basic_types_02735) - (select sum(cityHash64(*)) from file(basic_types_02735.parquet));
drop table basic_types_02735;


drop table if exists nullables_02735;
create temporary table nullables_02735 as select * from generateRandom('
    u16 Nullable(UInt16),
    i64 Nullable(Int64),
    datetime64 Nullable(DateTime64),
    enum8 Nullable(Enum8(''x'' = 1, ''y'' = 2, ''z'' = 3)),
    float64 Nullable(Float64),
    str Nullable(String),
    fstr Nullable(FixedString(12)),
    i256 Nullable(Int256),
    decimal256 Nullable(Decimal256(40)),
    ipv6 Nullable(IPv6)') limit 10000;
insert into function file(nullables_02735.parquet) select * from nullables_02735;
select (select sum(cityHash64(*)) from nullables_02735) - (select sum(cityHash64(*)) from file(nullables_02735.parquet));
drop table nullables_02735;


-- TODO: When cityHash64() fully supports Nullable: https://github.com/ClickHouse/ClickHouse/pull/48625
--       the next two blocks can be simplified: arrays_out_02735 intermediate table is not needed,
--       a.csv and b.csv are not needed.

drop table if exists arrays_02735;
drop table if exists arrays_out_02735;
create table arrays_02735 engine = Memory as select * from generateRandom('
    u32 Array(UInt32),
    i8 Array(Int8),
    datetime Array(DateTime),
    enum16 Array(Enum16(''xx'' = 1000, ''yy'' = 2000, ''zz'' = 3000)),
    float32 Array(Float32),
    str Array(String),
    fstr Array(FixedString(12)),
    u128 Array(UInt128),
    decimal64 Array(Decimal64(10)),
    ipv4 Array(IPv4),
    msi Map(String, Int16),
    tup Tuple(FixedString(3), Array(String), Map(Int8, Date))') limit 10000;
insert into function file(arrays_02735.parquet) select * from arrays_02735;
create temporary table arrays_out_02735 as arrays_02735;
insert into arrays_out_02735 select * from file(arrays_02735.parquet);
select (select sum(cityHash64(*)) from arrays_02735) - (select sum(cityHash64(*)) from arrays_out_02735);
--select (select sum(cityHash64(*)) from arrays_02735) -
--       (select sum(cityHash64(u32, i8, datetime, enum16, float32, str, fstr, arrayMap(x->reinterpret(x, 'UInt128'), u128), decimal64, ipv4, msi, tup)) from file(arrays_02735.parquet));
drop table arrays_02735;
drop table arrays_out_02735;


drop table if exists madness_02735;
create temporary table madness_02735 as select * from generateRandom('
    aa Array(Array(UInt32)),
    aaa Array(Array(Array(UInt32))),
    an Array(Nullable(String)),
    aan Array(Array(Nullable(FixedString(10)))),
    l LowCardinality(String),
    ln LowCardinality(Nullable(FixedString(11))),
    al Array(LowCardinality(UInt128)),
    aaln Array(Array(LowCardinality(Nullable(String)))),
    mln Map(LowCardinality(String), Nullable(Int8)),
    t Tuple(Map(FixedString(5), Tuple(Array(UInt16), Nullable(UInt16), Array(Tuple(Int8, Decimal64(10))))), Tuple(kitchen UInt64, sink String)),
    n Nested(hello UInt64, world Tuple(first String, second FixedString(1)))
    ') limit 10000;
insert into function file(madness_02735.parquet) select * from madness_02735;
insert into function file(a.csv) select * from madness_02735 order by tuple(*);
insert into function file(b.csv) select aa, aaa, an, aan, l, ln, arrayMap(x->reinterpret(x, 'UInt128'), al) as al_, aaln, mln, t, n.hello, n.world from file(madness_02735.parquet) order by tuple(aa, aaa, an, aan, l, ln, al_, aaln, mln, t, n.hello, n.world);
select (select sum(cityHash64(*)) from file(a.csv, LineAsString)) - (select sum(cityHash64(*)) from file(b.csv, LineAsString));
--select (select sum(cityHash64(*)) from madness_02735) -
--       (select sum(cityHash64(aa, aaa, an, aan, l, ln, map(x->reinterpret(x, 'UInt128'), al), aaln, mln, t, n.hello, n.world)) from file(madness_02735.parquet));
drop table madness_02735;


-- Merging input blocks into bigger row groups.
insert into function file(squash_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1;
select num_columns, num_rows, num_row_groups from file(squash_02735.parquet, ParquetMetadata);

-- Row group size limit in bytes.
insert into function file(row_group_bytes_02735.parquet) select '012345' union all select '543210' settings max_block_size = 1, output_format_parquet_row_group_size_bytes = 5;
select num_columns, num_rows, num_row_groups from file(row_group_bytes_02735.parquet, ParquetMetadata);

-- Row group size limit in rows.
insert into function file(tiny_row_groups_02735.parquet) select * from numbers(3) settings output_format_parquet_row_group_size = 1;
select num_columns, num_rows, num_row_groups from file(tiny_row_groups_02735.parquet, ParquetMetadata);

-- 1M unique 8-byte values should exceed dictionary_size_limit (1 MB).
insert into function file(big_column_chunk_02735.parquet) select number from numbers(1000000) settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(big_column_chunk_02735.parquet, ParquetMetadata);
select sum(cityHash64(number)) from file(big_column_chunk_02735.parquet);

-- Check statistics: signed vs unsigned, null count. Use enough rows to produce multiple pages.
insert into function file(statistics_02735.parquet) select 100 + number%200 as a, toUInt32(number * 3000) as u, toInt32(number * 3000) as i, if(number % 10 == 9, toString(number), null) as s from numbers(1000000) settings output_format_parquet_row_group_size = 1000000;
select num_columns, num_rows, num_row_groups from file(statistics_02735.parquet, ParquetMetadata);
select tupleElement(c, 'statistics') from file(statistics_02735.parquet, ParquetMetadata) array join tupleElement(row_groups[1], 'columns') as c;

-- Statistics string length limit (max_statistics_size).
insert into function file(long_string_02735.parquet) select toString(range(number * 2000)) from numbers(2);
select tupleElement(tupleElement(row_groups[1], 'columns'), 'statistics') from file(long_string_02735.parquet, ParquetMetadata);

-- Compression setting.
insert into function file(compressed_02735.parquet) select concat('aaaaaaaaaaaaaaaa', toString(number)) as s from numbers(1000) settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='zstd';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);
insert into function file(compressed_02735.parquet) select concat('aaaaaaaaaaaaaaaa', toString(number)) as s from numbers(1000) settings output_format_parquet_row_group_size = 10000, output_format_parquet_compression_method='none';
select total_compressed_size < 10000, total_uncompressed_size > 15000 from file(compressed_02735.parquet, ParquetMetadata);
insert into function file(compressed_02735.parquet) select if(number%3==1, NULL, 42) as x from numbers(70) settings output_format_parquet_compression_method='zstd';
select sum(cityHash64(*)) from file(compressed_02735.parquet);

-- Single-threaded encoding and Arrow encoder.
drop table if exists other_encoders_02735;
create temporary table other_encoders_02735 as select number, number*2 from numbers(10000);
insert into function file(single_thread_02735.parquet) select * from other_encoders_02735 settings max_threads = 1;
select sum(cityHash64(*)) from file(single_thread_02735.parquet);
insert into function file(arrow_02735.parquet) select * from other_encoders_02735 settings output_format_parquet_use_custom_encoder = 0;
select sum(cityHash64(*)) from file(arrow_02735.parquet);

-- String -> binary vs string; FixedString -> fixed-length-binary vs binary vs string.
insert into function file(strings1_02735.parquet) select 'never', toFixedString('gonna', 5) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 1;
select columns.5, columns.6 from file(strings1_02735.parquet, ParquetMetadata) array join columns;
insert into function file(strings2_02735.parquet) select 'give', toFixedString('you', 3) settings output_format_parquet_string_as_string = 0, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings2_02735.parquet, ParquetMetadata) array join columns;
insert into function file(strings3_02735.parquet) select toFixedString('up', 2) settings output_format_parquet_string_as_string = 1, output_format_parquet_fixed_string_as_fixed_byte_array = 0;
select columns.5, columns.6 from file(strings3_02735.parquet, ParquetMetadata) array join columns;
select * from file(strings1_02735.parquet);
select * from file(strings2_02735.parquet);
select * from file(strings3_02735.parquet);

-- DateTime64 with different units.
insert into function file(datetime64_02735.parquet) select
    toDateTime64(number / 1e3, 3) as ms,
    toDateTime64(number / 1e6, 6) as us,
    toDateTime64(number / 1e9, 9) as ns,
    toDateTime64(number / 1e2, 2) as cs,
    toDateTime64(number, 0) as s,
    toDateTime64(number / 1e7, 7) as dus
    from numbers(2000);
desc file(datetime64_02735.parquet);
select sum(cityHash64(*)) from file(datetime64_02735.parquet);