mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-27 10:02:01 +00:00
Fix
This commit is contained in:
parent
5e6f728248
commit
0133806e6c
@ -743,6 +743,15 @@ static ColumnWithTypeAndName readNonNullableColumnFromArrowColumn(
|
||||
case TypeIndex::IPv6:
|
||||
return readIPv6ColumnFromBinaryData(arrow_column, column_name);
|
||||
/// ORC format outputs big integers as binary column, because there is no fixed binary in ORC.
|
||||
///
|
||||
/// When ORC/Parquet file says the type is "byte array" or "fixed len byte array",
|
||||
/// but the clickhouse query says to interpret the column as e.g. Int128, it
|
||||
/// may mean one of two things:
|
||||
/// * The byte array is the 16 bytes of Int128, little-endian.
|
||||
/// * The byte array is an ASCII string containing the Int128 formatted in base 10.
|
||||
/// There's no reliable way to distinguish these cases. We just guess: if the
|
||||
/// byte array is variable-length, and the length is different from sizeof(type),
|
||||
/// we parse as text, otherwise as binary.
|
||||
case TypeIndex::Int128:
|
||||
return readColumnWithBigNumberFromBinaryData<ColumnInt128>(arrow_column, column_name, type_hint);
|
||||
case TypeIndex::UInt128:
|
||||
|
@ -224,21 +224,18 @@ static Field decodePlainParquetValueSlow(const std::string & data, parquet::Type
|
||||
if (data.empty())
|
||||
return Field();
|
||||
|
||||
/// Long integers.
|
||||
auto reinterpret_fixed_string = [&](auto x)
|
||||
{
|
||||
if (data.size() != sizeof(x))
|
||||
throw Exception(ErrorCodes::CANNOT_PARSE_NUMBER, "Unexpected {} size: {}", fieldTypeToString(Field::TypeToEnum<decltype(x)>::value), data.size());
|
||||
memcpy(&x, data.data(), data.size());
|
||||
return Field(x);
|
||||
};
|
||||
/// Long integers, encoded either as text or as little-endian bytes.
|
||||
/// The parquet file doesn't know that it's numbers, so the min/max are produced by comparing
|
||||
/// strings lexicographically. So these min and max are mostly useless to us.
|
||||
/// There's one case where they're not useless: min == max; currently we don't make use of this.
|
||||
switch (type_hint)
|
||||
{
|
||||
case TypeIndex::UInt128: return reinterpret_fixed_string(UInt128(0));
|
||||
case TypeIndex::UInt256: return reinterpret_fixed_string(UInt256(0));
|
||||
case TypeIndex::Int128: return reinterpret_fixed_string(Int128(0));
|
||||
case TypeIndex::Int256: return reinterpret_fixed_string(Int256(0));
|
||||
case TypeIndex::IPv6: return reinterpret_fixed_string(IPv6(0));
|
||||
case TypeIndex::UInt128:
|
||||
case TypeIndex::UInt256:
|
||||
case TypeIndex::Int128:
|
||||
case TypeIndex::Int256:
|
||||
case TypeIndex::IPv6:
|
||||
return Field();
|
||||
default: break;
|
||||
}
|
||||
|
||||
|
@ -1 +1,2 @@
|
||||
424242424242424242424242424242424242424242424242424242
|
||||
22707864971053448441042714569797161695738549521977760418632926980540162388532
|
||||
|
@ -5,5 +5,8 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
# shellcheck source=../shell_config.sh
|
||||
. "$CUR_DIR"/../shell_config.sh
|
||||
|
||||
# This is parsed as text.
|
||||
$CLICKHOUSE_LOCAL -q "select toString(424242424242424242424242424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table"
|
||||
|
||||
# But this is parsed as binary because text length happens to be 32 bytes. Not ideal.
|
||||
$CLICKHOUSE_LOCAL -q "select toString(42424242424242424242424242424242::UInt256) as x format Parquet" | $CLICKHOUSE_LOCAL --input-format=Parquet --structure='x UInt256' -q "select * from table"
|
||||
|
Loading…
Reference in New Issue
Block a user