This commit is contained in:
Arthur Passos 2024-09-19 19:54:12 -03:00
parent 10718eb805
commit d017738a28
3 changed files with 48 additions and 26 deletions

View File

@ -321,6 +321,8 @@ bool isUInt8(TYPE data_type) { return WhichDataType(data_type).isUInt8(); } \
bool isUInt16(TYPE data_type) { return WhichDataType(data_type).isUInt16(); } \ bool isUInt16(TYPE data_type) { return WhichDataType(data_type).isUInt16(); } \
bool isUInt32(TYPE data_type) { return WhichDataType(data_type).isUInt32(); } \ bool isUInt32(TYPE data_type) { return WhichDataType(data_type).isUInt32(); } \
bool isUInt64(TYPE data_type) { return WhichDataType(data_type).isUInt64(); } \ bool isUInt64(TYPE data_type) { return WhichDataType(data_type).isUInt64(); } \
bool isUInt128(TYPE data_type) { return WhichDataType(data_type).isUInt128(); } \
bool isUInt256(TYPE data_type) { return WhichDataType(data_type).isUInt256(); } \
bool isNativeUInt(TYPE data_type) { return WhichDataType(data_type).isNativeUInt(); } \ bool isNativeUInt(TYPE data_type) { return WhichDataType(data_type).isNativeUInt(); } \
bool isUInt(TYPE data_type) { return WhichDataType(data_type).isUInt(); } \ bool isUInt(TYPE data_type) { return WhichDataType(data_type).isUInt(); } \
\ \
@ -328,6 +330,8 @@ bool isInt8(TYPE data_type) { return WhichDataType(data_type).isInt8(); } \
bool isInt16(TYPE data_type) { return WhichDataType(data_type).isInt16(); } \ bool isInt16(TYPE data_type) { return WhichDataType(data_type).isInt16(); } \
bool isInt32(TYPE data_type) { return WhichDataType(data_type).isInt32(); } \ bool isInt32(TYPE data_type) { return WhichDataType(data_type).isInt32(); } \
bool isInt64(TYPE data_type) { return WhichDataType(data_type).isInt64(); } \ bool isInt64(TYPE data_type) { return WhichDataType(data_type).isInt64(); } \
bool isInt128(TYPE data_type) { return WhichDataType(data_type).isInt128(); } \
bool isInt256(TYPE data_type) { return WhichDataType(data_type).isInt256(); } \
bool isNativeInt(TYPE data_type) { return WhichDataType(data_type).isNativeInt(); } \ bool isNativeInt(TYPE data_type) { return WhichDataType(data_type).isNativeInt(); } \
bool isInt(TYPE data_type) { return WhichDataType(data_type).isInt(); } \ bool isInt(TYPE data_type) { return WhichDataType(data_type).isInt(); } \
\ \

View File

@ -459,7 +459,9 @@ struct WhichDataType
bool isUInt8(TYPE data_type); \ bool isUInt8(TYPE data_type); \
bool isUInt16(TYPE data_type); \ bool isUInt16(TYPE data_type); \
bool isUInt32(TYPE data_type); \ bool isUInt32(TYPE data_type); \
bool isUInt64(TYPE data_type); \ bool isUInt64(TYPE data_type);\
bool isUInt128(TYPE data_type);\
bool isUInt256(TYPE data_type); \
bool isNativeUInt(TYPE data_type); \ bool isNativeUInt(TYPE data_type); \
bool isUInt(TYPE data_type); \ bool isUInt(TYPE data_type); \
\ \
@ -467,6 +469,8 @@ bool isInt8(TYPE data_type); \
bool isInt16(TYPE data_type); \ bool isInt16(TYPE data_type); \
bool isInt32(TYPE data_type); \ bool isInt32(TYPE data_type); \
bool isInt64(TYPE data_type); \ bool isInt64(TYPE data_type); \
bool isInt128(TYPE data_type); \
bool isInt256(TYPE data_type); \
bool isNativeInt(TYPE data_type); \ bool isNativeInt(TYPE data_type); \
bool isInt(TYPE data_type); \ bool isInt(TYPE data_type); \
\ \

View File

@ -24,21 +24,6 @@ namespace ErrorCodes
namespace namespace
{ {
// parquet::ByteArray createByteArray(std::string_view view, TypeIndex type, uint8_t * buffer, uint32_t buffer_size)
// {
// if (isStringOrFixedString(type))
// {
// return view;
// }
// else
// {
// auto size = static_cast<uint32_t>(std::max(view.size(), sizeof(uint32_t)));
// chassert(size <= buffer_size);
// std::copy(view.begin(), view.end(), buffer);
// return parquet::ByteArray(size, buffer);
// }
// }
template <typename IntegerType, typename ColumnType = IntegerType> template <typename IntegerType, typename ColumnType = IntegerType>
void hashInt(const IColumn * data_column, ColumnUInt64::Container & hashes_internal_data) void hashInt(const IColumn * data_column, ColumnUInt64::Container & hashes_internal_data)
{ {
@ -78,21 +63,22 @@ namespace
switch (clickhouse_type->getTypeId()) switch (clickhouse_type->getTypeId())
{ {
case TypeIndex::UInt8: case TypeIndex::UInt8:
hashInt<uint8_t>(data_column, hashes_internal_data); hashInt<int32_t, uint8_t>(data_column, hashes_internal_data);
break; break;
case TypeIndex::UInt16: case TypeIndex::UInt16:
hashInt<uint8_t>(data_column, hashes_internal_data); hashInt<int32_t, uint16_t>(data_column, hashes_internal_data);
break; break;
case TypeIndex::UInt32: case TypeIndex::UInt32:
hashInt<uint8_t>(data_column, hashes_internal_data); hashInt<int32_t, uint32_t>(data_column, hashes_internal_data);
break;
case TypeIndex::UInt64:
hashInt<int64_t, uint64_t>(data_column, hashes_internal_data);
break; break;
// case TypeIndex::UInt64:
// break;
case TypeIndex::Int8: case TypeIndex::Int8:
hashInt<int8_t>(data_column, hashes_internal_data); hashInt<int32_t, int8_t>(data_column, hashes_internal_data);
break; break;
case TypeIndex::Int16: case TypeIndex::Int16:
hashInt<int16_t>(data_column, hashes_internal_data); hashInt<int32_t, int16_t>(data_column, hashes_internal_data);
break; break;
case TypeIndex::Int32: case TypeIndex::Int32:
hashInt<int32_t>(data_column, hashes_internal_data); hashInt<int32_t>(data_column, hashes_internal_data);
@ -107,8 +93,6 @@ namespace
hashString<ColumnFixedString>(data_column, hashes_internal_data); hashString<ColumnFixedString>(data_column, hashes_internal_data);
break; break;
// case TypeIndex::IPv4: // case TypeIndex::IPv4:
// break;
// case TypeIndex::JSONPaths:
// break; // break;
default: default:
break; break;
@ -139,6 +123,17 @@ namespace
return match_all; return match_all;
} }
/// Returns true when the ClickHouse type is an integer type or IPv4.
/// Serves as the pre-check for Parquet columns whose physical type is INT32 or INT64.
/// The type pointer is taken by const reference because it is only inspected here,
/// so there is no need to copy the pointer object on every call.
bool isClickHouseTypeCompatibleWithParquetIntegerType(const DataTypePtr & clickhouse_type)
{
    return isInteger(clickhouse_type) || isIPv4(clickhouse_type);
}
/// Returns true when the ClickHouse type is a string or fixed string, IPv6, UInt128 or UInt256.
/// Serves as the pre-check for Parquet columns whose physical type is BYTE_ARRAY or FIXED_LEN_BYTE_ARRAY.
/// The type pointer is taken by const reference because it is only inspected here,
/// so there is no need to copy the pointer object on every call.
/// NOTE(review): Int128 / Int256 are not accepted here although isInt128 / isInt256 helpers exist - confirm this is intended.
bool isClickHouseTypeCompatibleWithParquetByteType(const DataTypePtr & clickhouse_type)
{
    return isStringOrFixedString(clickhouse_type) || isIPv6(clickhouse_type)
        || isUInt128(clickhouse_type) || isUInt256(clickhouse_type);
}
bool isColumnSupported(const DataTypePtr clickhouse_type, const parquet::ColumnDescriptor * column_descriptor) bool isColumnSupported(const DataTypePtr clickhouse_type, const parquet::ColumnDescriptor * column_descriptor)
{ {
if (column_descriptor->converted_type() == parquet::ConvertedType::NONE && column_descriptor->logical_type() != nullptr) if (column_descriptor->converted_type() == parquet::ConvertedType::NONE && column_descriptor->logical_type() != nullptr)
@ -159,11 +154,16 @@ namespace
if (physical_type == parquet::Type::type::INT32 || physical_type == parquet::Type::type::INT64) if (physical_type == parquet::Type::type::INT32 || physical_type == parquet::Type::type::INT64)
{ {
// branching with false and true is weird // branching with false and true is weird
if (!isInteger(clickhouse_type) && !(isIPv4(clickhouse_type))) if (!isClickHouseTypeCompatibleWithParquetIntegerType(clickhouse_type))
{ {
return false; return false;
} }
if (!logical_type && parquet::ConvertedType::type::NONE == converted_type)
{
return true;
}
if (logical_type && logical_type->is_int()) if (logical_type && logical_type->is_int())
{ {
return true; return true;
@ -179,6 +179,17 @@ namespace
} }
else if (physical_type == parquet::Type::type::BYTE_ARRAY || physical_type == parquet::Type::type::FIXED_LEN_BYTE_ARRAY) else if (physical_type == parquet::Type::type::BYTE_ARRAY || physical_type == parquet::Type::type::FIXED_LEN_BYTE_ARRAY)
{ {
// branching with false and true is weird
if (!isClickHouseTypeCompatibleWithParquetByteType(clickhouse_type))
{
return false;
}
if (!logical_type && parquet::ConvertedType::type::NONE == converted_type)
{
return true;
}
if (logical_type && (logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON())) if (logical_type && (logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON()))
{ {
return true; return true;
@ -315,6 +326,9 @@ std::vector<ParquetBloomFilterCondition::ConditionElement> keyConditionRPNToParq
using RPNElement = KeyCondition::RPNElement; using RPNElement = KeyCondition::RPNElement;
using Function = ParquetBloomFilterCondition::ConditionElement::Function; using Function = ParquetBloomFilterCondition::ConditionElement::Function;
// TODO(arthur): a condition such as
//   where toIPv4(uint32_col) = ...
// currently results in "function unknown".
for (const auto & rpn_element : rpn) for (const auto & rpn_element : rpn)
{ {
Columns columns; Columns columns;