Fix writing ORC statistics for unsigned types

This commit is contained in:
Michael Kolupaev 2024-05-29 04:10:38 +00:00
parent 764cdb971c
commit fd93097130
6 changed files with 96 additions and 8 deletions

2
contrib/orc vendored

@ -1 +1 @@
Subproject commit e24f2c2a3ca0769c96704ab20ad6f512a83ea2ad
Subproject commit 947cebaf9432d708253ac08dc3012daa6b4ede6f

View File

@ -269,7 +269,12 @@ convertFieldToORCLiteral(const orc::Type & orc_type, const Field & field, DataTy
case orc::SHORT:
case orc::INT:
case orc::LONG: {
/// May throw exception
/// May throw exception.
///
/// In particular, it throws if the column is requested as an unsigned type, like this:
///   SELECT * FROM file('t.orc', ORC, 'x UInt8') WHERE x > 10
/// We have to reject such a literal instead of passing it to ORC: ORC has no unsigned
/// types, so values > 127 are treated as negative by ORC, and a filter built from
/// `x > 10` would miss them.
auto val = field.get<Int64>();
return orc::Literal(val);
}

View File

@ -315,18 +315,20 @@ void ORCBlockOutputFormat::writeColumn(
if (null_bytemap)
orc_column.hasNulls = true;
/// ORC doesn't have unsigned types, so cast everything to signed and sign-extend to Int64 to
/// make the ORC library calculate min and max correctly.
switch (type->getTypeId())
{
case TypeIndex::Enum8: [[fallthrough]];
case TypeIndex::Int8:
{
/// Note: Explicit cast to avoid clang-tidy error: 'signed char' to 'long' conversion; consider casting to 'unsigned char' first.
writeNumbers<Int8, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int8 & value){ return static_cast<int64_t>(value); });
writeNumbers<Int8, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const Int8 & value){ return Int64(Int8(value)); });
break;
}
case TypeIndex::UInt8:
{
writeNumbers<UInt8, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt8 & value){ return value; });
writeNumbers<UInt8, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt8 & value){ return Int64(Int8(value)); });
break;
}
case TypeIndex::Enum16: [[fallthrough]];
@ -338,7 +340,7 @@ void ORCBlockOutputFormat::writeColumn(
case TypeIndex::Date: [[fallthrough]];
case TypeIndex::UInt16:
{
writeNumbers<UInt16, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt16 & value){ return value; });
writeNumbers<UInt16, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt16 & value){ return Int64(Int16(value)); });
break;
}
case TypeIndex::Date32: [[fallthrough]];
@ -349,12 +351,12 @@ void ORCBlockOutputFormat::writeColumn(
}
case TypeIndex::UInt32:
{
writeNumbers<UInt32, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt32 & value){ return value; });
writeNumbers<UInt32, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const UInt32 & value){ return Int64(Int32(value)); });
break;
}
case TypeIndex::IPv4:
{
writeNumbers<IPv4, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const IPv4 & value){ return value.toUnderType(); });
writeNumbers<IPv4, orc::LongVectorBatch>(orc_column, column, null_bytemap, [](const IPv4 & value){ return Int64(Int32(value.toUnderType())); });
break;
}
case TypeIndex::Int64:

View File

@ -1,4 +1,4 @@
-- Tags: no-fasttest, no-parallel, no-cpu-aarch64
-- Tags: no-fasttest, no-parallel
set output_format_orc_string_as_string = 1;
set output_format_orc_row_index_stride = 100;

View File

@ -0,0 +1,41 @@
-- { echoOn }
select x from file('i8.orc') where indexHint(x = -128);
-128
select x from file('i8.orc') where indexHint(x = 128);
select x from file('u8.orc') where indexHint(x = -128);
-128
select x from file('u8.orc') where indexHint(x = 128);
select x from file('i16.orc') where indexHint(x = -32768);
-32768
select x from file('i16.orc') where indexHint(x = 32768);
select x from file('u16.orc') where indexHint(x = -32768);
-32768
select x from file('u16.orc') where indexHint(x = 32768);
select x from file('i32.orc') where indexHint(x = -2147483648);
-2147483648
select x from file('i32.orc') where indexHint(x = 2147483648);
select x from file('u32.orc') where indexHint(x = -2147483648);
-2147483648
select x from file('u32.orc') where indexHint(x = 2147483648);
select x from file('i64.orc') where indexHint(x = -9223372036854775808);
-9223372036854775808
select x from file('i64.orc') where indexHint(x = 9223372036854775808);
-9223372036854775808
select x from file('u64.orc') where indexHint(x = -9223372036854775808);
-9223372036854775808
select x from file('u64.orc') where indexHint(x = 9223372036854775808);
-9223372036854775808
select x from file('u8.orc', ORC, 'x UInt8') where indexHint(x > 10);
128
select x from file('u8.orc', ORC, 'x UInt64') where indexHint(x > 10);
18446744073709551488
select x from file('u16.orc', ORC, 'x UInt16') where indexHint(x > 10);
32768
select x from file('u16.orc', ORC, 'x UInt64') where indexHint(x > 10);
18446744073709518848
select x from file('u32.orc', ORC, 'x UInt32') where indexHint(x > 10);
2147483648
select x from file('u32.orc', ORC, 'x UInt64') where indexHint(x > 10);
18446744071562067968
select x from file('u64.orc', ORC, 'x UInt64') where indexHint(x > 10);
9223372036854775808

View File

@ -0,0 +1,40 @@
-- Test for reading ORC files with filter push-down when the written column is an
-- integer at the signed/unsigned boundary (see the expected output in the reference file).

-- Enable filter push-down for the ORC input format; presumably this is what makes the
-- indexHint() conditions below get checked against the statistics stored in the files.
set input_format_orc_filter_push_down = 1;
-- Truncate the target file on insert (presumably so that re-running the test does not
-- depend on files left over from a previous run).
set engine_file_truncate_on_insert = 1;

-- Write one single-row file per integer type. Signed files hold the minimum value of the
-- type; unsigned files hold 2^(N-1) for the N-bit type, i.e. the value with the same bit
-- pattern as the signed minimum of the same width.
insert into function file('i8.orc') select materialize(-128)::Int8 as x;
insert into function file('u8.orc') select materialize(128)::UInt8 as x;
insert into function file('i16.orc') select materialize(-32768)::Int16 as x;
insert into function file('u16.orc') select materialize(32768)::UInt16 as x;
insert into function file('i32.orc') select materialize(-2147483648)::Int32 as x;
insert into function file('u32.orc') select materialize(2147483648)::UInt32 as x;
insert into function file('i64.orc') select materialize(-9223372036854775808)::Int64 as x;
insert into function file('u64.orc') select materialize(9223372036854775808)::UInt64 as x;

-- NOTE(review): no comments are added below this directive, because the echoed text is
-- compared with the reference file and extra comments might end up in that output.
-- { echoOn }
select x from file('i8.orc') where indexHint(x = -128);
select x from file('i8.orc') where indexHint(x = 128);
select x from file('u8.orc') where indexHint(x = -128);
select x from file('u8.orc') where indexHint(x = 128);
select x from file('i16.orc') where indexHint(x = -32768);
select x from file('i16.orc') where indexHint(x = 32768);
select x from file('u16.orc') where indexHint(x = -32768);
select x from file('u16.orc') where indexHint(x = 32768);
select x from file('i32.orc') where indexHint(x = -2147483648);
select x from file('i32.orc') where indexHint(x = 2147483648);
select x from file('u32.orc') where indexHint(x = -2147483648);
select x from file('u32.orc') where indexHint(x = 2147483648);
select x from file('i64.orc') where indexHint(x = -9223372036854775808);
select x from file('i64.orc') where indexHint(x = 9223372036854775808);
select x from file('u64.orc') where indexHint(x = -9223372036854775808);
select x from file('u64.orc') where indexHint(x = 9223372036854775808);
select x from file('u8.orc', ORC, 'x UInt8') where indexHint(x > 10);
select x from file('u8.orc', ORC, 'x UInt64') where indexHint(x > 10);
select x from file('u16.orc', ORC, 'x UInt16') where indexHint(x > 10);
select x from file('u16.orc', ORC, 'x UInt64') where indexHint(x > 10);
select x from file('u32.orc', ORC, 'x UInt32') where indexHint(x > 10);
select x from file('u32.orc', ORC, 'x UInt64') where indexHint(x > 10);
select x from file('u64.orc', ORC, 'x UInt64') where indexHint(x > 10);