From 0133806e6c991325b5ace29469d8f450969c3ec0 Mon Sep 17 00:00:00 2001 From: Michael Kolupaev Date: Tue, 13 Aug 2024 20:55:51 +0000 Subject: [PATCH] Fix --- .../Formats/Impl/ArrowColumnToCHColumn.cpp | 9 ++++++++ .../Formats/Impl/ParquetBlockInputFormat.cpp | 23 ++++++++----------- ...arquet_big_integer_compatibility.reference | 1 + ...02786_parquet_big_integer_compatibility.sh | 3 +++ 4 files changed, 23 insertions(+), 13 deletions(-) diff --git a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp index ed91913de4d..5e7f763dfbc 100644 --- a/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp +++ b/src/Processors/Formats/Impl/ArrowColumnToCHColumn.cpp @@ -743,6 +743,15 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn( case TypeIndex::IPv6: return readIPv6ColumnFromBinaryData(arrow_column, column_name); /// ORC format outputs big integers as binary column, because there is no fixed binary in ORC. + /// + /// When ORC/Parquet file says the type is "byte array" or "fixed len byte array", + /// but the clickhouse query says to interpret the column as e.g. Int128, it + /// may mean one of two things: + /// * The byte array is the 16 bytes of Int128, little-endian. + /// * The byte array is an ASCII string containing the Int128 formatted in base 10. + /// There's no reliable way to distinguish these cases. We just guess: if the + /// byte array is variable-length, and the length is different from sizeof(type), + /// we parse as text, otherwise as binary. case TypeIndex::Int128: return readColumnWithBigNumberFromBinaryData(arrow_column, column_name, type_hint); case TypeIndex::UInt128: diff --git a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp index 3f2e701afc2..c6167e572df 100644 --- a/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParquetBlockInputFormat.cpp @@ -224,21 +224,18 @@ static Field decodePlainParquetValueSlow(const std::string & data, parquet::Type if (data.empty()) return Field(); - /// Long integers. - auto reinterpret_fixed_string = [&](auto x) - { - if (data.size() != sizeof(x)) - throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Unexpected {} size: {}", fieldTypeToString(Field::TypeToEnum::value), data.size()); - memcpy(&x, data.data(), data.size()); - return Field(x); - }; + /// Long integers, encoded either as text or as little-endian bytes. + /// The parquet file doesn't know that it's numbers, so the min/max are produced by comparing + /// strings lexicographically. So these min and max are mostly useless to us. + /// There's one case where they're not useless: min == max; currently we don't make use of this. switch (type_hint) { - case TypeIndex::UInt128: return reinterpret_fixed_string(UInt128(0)); - case TypeIndex::UInt256: return reinterpret_fixed_string(UInt256(0)); - case TypeIndex::Int128: return reinterpret_fixed_string(Int128(0)); - case TypeIndex::Int256: return reinterpret_fixed_string(Int256(0)); - case TypeIndex::IPv6: return reinterpret_fixed_string(IPv6(0)); + case TypeIndex::UInt128: + case TypeIndex::UInt256: + case TypeIndex::Int128: + case TypeIndex::Int256: + case TypeIndex::IPv6: + return Field(); default: break; } diff --git a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference index 7764974255b..877bb5f390f 100644 --- a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference +++ b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.reference @@ -1 +1,2 @@ 424242424242424242424242424242424242424242424242424242 +22707864971053448441042714569797161695738549521977760418632926980540162388532 diff --git a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh index 8865b2e7aab..0f590027f19 100755 --- a/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh +++ b/tests/queries/0_stateless/02786_parquet_big_integer_compatibility.sh @@ -5,5 +5,8 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CUR_DIR"/../shell_config.sh +# This is parsed as text. $CLICKHOUSE_LOCAL -q "select toString(424242424242424242424242424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table" +# But this is parsed as binary because text length happens to be 32 bytes. Not ideal. +$CLICKHOUSE_LOCAL -q "select toString(42424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table"