From d017738a28fd374b5b227243812e6c8fedbf92cf Mon Sep 17 00:00:00 2001 From: Arthur Passos Date: Thu, 19 Sep 2024 19:54:12 -0300 Subject: [PATCH] progress --- src/DataTypes/IDataType.cpp | 4 ++ src/DataTypes/IDataType.h | 6 +- .../Parquet/ParquetBloomFilterCondition.cpp | 64 +++++++++++-------- 3 files changed, 48 insertions(+), 26 deletions(-) diff --git a/src/DataTypes/IDataType.cpp b/src/DataTypes/IDataType.cpp index 945d36dbb92..7165ad18999 100644 --- a/src/DataTypes/IDataType.cpp +++ b/src/DataTypes/IDataType.cpp @@ -321,6 +321,8 @@ bool isUInt8(TYPE data_type) { return WhichDataType(data_type).isUInt8(); } \ bool isUInt16(TYPE data_type) { return WhichDataType(data_type).isUInt16(); } \ bool isUInt32(TYPE data_type) { return WhichDataType(data_type).isUInt32(); } \ bool isUInt64(TYPE data_type) { return WhichDataType(data_type).isUInt64(); } \ +bool isUInt128(TYPE data_type) { return WhichDataType(data_type).isUInt128(); } \ +bool isUInt256(TYPE data_type) { return WhichDataType(data_type).isUInt256(); } \ bool isNativeUInt(TYPE data_type) { return WhichDataType(data_type).isNativeUInt(); } \ bool isUInt(TYPE data_type) { return WhichDataType(data_type).isUInt(); } \ \ @@ -328,6 +330,8 @@ bool isInt8(TYPE data_type) { return WhichDataType(data_type).isInt8(); } \ bool isInt16(TYPE data_type) { return WhichDataType(data_type).isInt16(); } \ bool isInt32(TYPE data_type) { return WhichDataType(data_type).isInt32(); } \ bool isInt64(TYPE data_type) { return WhichDataType(data_type).isInt64(); } \ +bool isInt128(TYPE data_type) { return WhichDataType(data_type).isInt128(); } \ +bool isInt256(TYPE data_type) { return WhichDataType(data_type).isInt256(); } \ bool isNativeInt(TYPE data_type) { return WhichDataType(data_type).isNativeInt(); } \ bool isInt(TYPE data_type) { return WhichDataType(data_type).isInt(); } \ \ diff --git a/src/DataTypes/IDataType.h b/src/DataTypes/IDataType.h index a7665e610ab..2daaaf3caf9 100644 --- a/src/DataTypes/IDataType.h +++ b/src/DataTypes/IDataType.h @@ -459,7 +459,9 @@ struct WhichDataType bool isUInt8(TYPE data_type); \ bool isUInt16(TYPE data_type); \ bool isUInt32(TYPE data_type); \ -bool isUInt64(TYPE data_type); \ +bool isUInt64(TYPE data_type);\ +bool isUInt128(TYPE data_type);\ +bool isUInt256(TYPE data_type); \ bool isNativeUInt(TYPE data_type); \ bool isUInt(TYPE data_type); \ \ @@ -467,6 +469,8 @@ bool isInt8(TYPE data_type); \ bool isInt16(TYPE data_type); \ bool isInt32(TYPE data_type); \ bool isInt64(TYPE data_type); \ +bool isInt128(TYPE data_type); \ +bool isInt256(TYPE data_type); \ bool isNativeInt(TYPE data_type); \ bool isInt(TYPE data_type); \ \ diff --git a/src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp b/src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp index 39434834a23..7e4a13cd979 100644 --- a/src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp +++ b/src/Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.cpp @@ -24,21 +24,6 @@ namespace ErrorCodes namespace { -// parquet::ByteArray createByteArray(std::string_view view, TypeIndex type, uint8_t * buffer, uint32_t buffer_size) -// { -// if (isStringOrFixedString(type)) -// { -// return view; -// } -// else -// { -// auto size = static_cast(std::max(view.size(), sizeof(uint32_t))); -// chassert(size <= buffer_size); -// std::copy(view.begin(), view.end(), buffer); -// return parquet::ByteArray(size, buffer); -// } -// } - template void hashInt(const IColumn * data_column, ColumnUInt64::Container & hashes_internal_data) { @@ -78,21 +63,22 @@ namespace switch (clickhouse_type->getTypeId()) { case TypeIndex::UInt8: - hashInt(data_column, hashes_internal_data); + hashInt(data_column, hashes_internal_data); break; case TypeIndex::UInt16: - hashInt(data_column, hashes_internal_data); + hashInt(data_column, hashes_internal_data); break; case TypeIndex::UInt32: - hashInt(data_column, hashes_internal_data); + hashInt(data_column, hashes_internal_data); + break; + case TypeIndex::UInt64: + hashInt(data_column, hashes_internal_data); break; -// case TypeIndex::UInt64: -// break; case TypeIndex::Int8: - hashInt(data_column, hashes_internal_data); + hashInt(data_column, hashes_internal_data); break; case TypeIndex::Int16: - hashInt(data_column, hashes_internal_data); + hashInt(data_column, hashes_internal_data); break; case TypeIndex::Int32: hashInt(data_column, hashes_internal_data); @@ -107,8 +93,6 @@ namespace hashString(data_column, hashes_internal_data); break; // case TypeIndex::IPv4: -// break; -// case TypeIndex::JSONPaths: // break; default: break; @@ -139,6 +123,17 @@ namespace return match_all; } + bool isClickHouseTypeCompatibleWithParquetIntegerType(const DataTypePtr clickhouse_type) + { + return isInteger(clickhouse_type) || isIPv4(clickhouse_type); + } + + bool isClickHouseTypeCompatibleWithParquetByteType(const DataTypePtr clickhouse_type) + { + return isStringOrFixedString(clickhouse_type) || isIPv6(clickhouse_type) + || isUInt128(clickhouse_type) || isUInt256(clickhouse_type); + } + bool isColumnSupported(const DataTypePtr clickhouse_type, const parquet::ColumnDescriptor * column_descriptor) { if (column_descriptor->converted_type() == parquet::ConvertedType::NONE && column_descriptor->logical_type() != nullptr) @@ -159,11 +154,16 @@ namespace if (physical_type == parquet::Type::type::INT32 || physical_type == parquet::Type::type::INT64) { // branching with false and true is weird - if (!isInteger(clickhouse_type) && !(isIPv4(clickhouse_type))) + if (!isClickHouseTypeCompatibleWithParquetIntegerType(clickhouse_type)) { return false; } + if (!logical_type && parquet::ConvertedType::type::NONE == converted_type) + { + return true; + } + if (logical_type && logical_type->is_int()) { return true; @@ -179,6 +179,17 @@ namespace } else if (physical_type == parquet::Type::type::BYTE_ARRAY || physical_type == parquet::Type::type::FIXED_LEN_BYTE_ARRAY) { + // branching with false and true is weird + if (!isClickHouseTypeCompatibleWithParquetByteType(clickhouse_type)) + { + return false; + } + + if (!logical_type && parquet::ConvertedType::type::NONE == converted_type) + { + return true; + } + if (logical_type && (logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON())) { return true; @@ -315,6 +326,9 @@ std::vector keyConditionRPNToParq using RPNElement = KeyCondition::RPNElement; using Function = ParquetBloomFilterCondition::ConditionElement::Function; + // todo arthur + // where toIPv4(uint32_col) = ... + // results in function unknown.. for (const auto & rpn_element : rpn) { Columns columns;