This commit is contained in:
Michael Kolupaev 2024-08-13 20:55:51 +00:00
parent 5e6f728248
commit 0133806e6c
4 changed files with 23 additions and 13 deletions

View File

@ -743,6 +743,15 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn(
case TypeIndex::IPv6: case TypeIndex::IPv6:
return readIPv6ColumnFromBinaryData(arrow_column, column_name); return readIPv6ColumnFromBinaryData(arrow_column, column_name);
/// ORC format outputs big integers as binary column, because there is no fixed binary in ORC. /// ORC format outputs big integers as binary column, because there is no fixed binary in ORC.
///
/// When an ORC/Parquet file says the type is "byte array" or "fixed len byte array",
/// but the ClickHouse query says to interpret the column as e.g. Int128, it
/// may mean one of two things:
/// * The byte array is the 16 bytes of Int128, little-endian.
/// * The byte array is an ASCII string containing the Int128 formatted in base 10.
/// There's no reliable way to distinguish these cases, so we guess: if the
/// byte array is variable-length and its length differs from sizeof(type),
/// we parse it as text; otherwise we parse it as binary. (A decimal string whose
/// length happens to equal sizeof(type) is therefore read as binary.)
case TypeIndex::Int128: case TypeIndex::Int128:
return readColumnWithBigNumberFromBinaryData<ColumnInt128>(arrow_column, column_name, type_hint); return readColumnWithBigNumberFromBinaryData<ColumnInt128>(arrow_column, column_name, type_hint);
case TypeIndex::UInt128: case TypeIndex::UInt128:

View File

@ -224,21 +224,18 @@ static Field decodePlainParquetValueSlow(const std::string & data, parquet::Type
if (data.empty()) if (data.empty())
return Field(); return Field();
/// Long integers. /// Long integers, encoded either as text or as little-endian bytes.
auto reinterpret_fixed_string = [&](auto x) /// The parquet file doesn't know that it's numbers, so the min/max are produced by comparing
{ /// strings lexicographically. So these min and max are mostly useless to us.
if (data.size() != sizeof(x)) /// There's one case where they're not useless: min == max; currently we don't make use of this.
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Unexpected {} size: {}", fieldTypeToString(Field::TypeToEnum<decltype(x)>::value), data.size());
memcpy(&x, data.data(), data.size());
return Field(x);
};
switch (type_hint) switch (type_hint)
{ {
case TypeIndex::UInt128: return reinterpret_fixed_string(UInt128(0)); case TypeIndex::UInt128:
case TypeIndex::UInt256: return reinterpret_fixed_string(UInt256(0)); case TypeIndex::UInt256:
case TypeIndex::Int128: return reinterpret_fixed_string(Int128(0)); case TypeIndex::Int128:
case TypeIndex::Int256: return reinterpret_fixed_string(Int256(0)); case TypeIndex::Int256:
case TypeIndex::IPv6: return reinterpret_fixed_string(IPv6(0)); case TypeIndex::IPv6:
return Field();
default: break; default: break;
} }

View File

@ -1 +1,2 @@
424242424242424242424242424242424242424242424242424242 424242424242424242424242424242424242424242424242424242
22707864971053448441042714569797161695738549521977760418632926980540162388532

View File

@ -5,5 +5,8 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh # shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh . "$CUR_DIR"/../shell_config.sh
# This is parsed as text, because the string's length differs from sizeof(UInt256), i.e. 32 bytes.
$CLICKHOUSE_LOCAL -q "select toString(424242424242424242424242424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table" $CLICKHOUSE_LOCAL -q "select toString(424242424242424242424242424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table"
# But this is parsed as binary, because the text happens to be exactly 32 bytes long (the size of UInt256), so the result is not the original number. Not ideal.
$CLICKHOUSE_LOCAL -q "select toString(42424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table"