Merge remote-tracking branch 'ClickHouse/master' into query-time-ef-search

Robert Schulze 2024-10-16 09:43:28 +00:00
commit 4b6b152562
GPG Key ID: 26703B55FB13728A
43 changed files with 1664 additions and 52 deletions

View File

@ -1,4 +1,21 @@
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_DEBUG=0") # More checks in debug build.
if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
# Enable libcxx debug mode: https://releases.llvm.org/15.0.0/projects/libcxx/docs/DesignDocs/DebugMode.html
# The docs say the debug mode violates complexity guarantees, so do this only for Debug builds.
# set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_ENABLE_DEBUG_MODE=1")
# ^^ Crashes the database upon startup, needs investigation.
# Besides that, the implementation looks like a poor man's MSAN specific to libcxx. Since CI tests MSAN
# anyway, we can keep the debug mode disabled.
# Libcxx also provides extra assertions:
# --> https://releases.llvm.org/15.0.0/projects/libcxx/docs/UsingLibcxx.html#assertions-mode
# These look orthogonal to the debug mode but the debug mode enables them implicitly:
# --> https://github.com/llvm/llvm-project/blob/release/15.x/libcxx/include/__assert#L29
# They are cheap and straightforward, so enable them in debug builds:
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_ENABLE_ASSERTIONS=1")
# TODO Once we upgrade to LLVM 18+, reconsider all of the above as they introduced "hardening modes":
# https://libcxx.llvm.org/Hardening.html
endif ()
add_subdirectory(contrib/libcxxabi-cmake)
add_subdirectory(contrib/libcxx-cmake)

View File

@ -1,6 +1,9 @@
set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp")
set(ABSL_COMMON_INCLUDE_DIRS "${ABSL_ROOT_DIR}")
# To avoid errors "'X' does not refer to a value" while using the `offsetof` function.
set(CMAKE_CXX_STANDARD 17)
# This is a minimized version of the function definition in CMake/AbseilHelpers.cmake
#

View File

@ -5,6 +5,9 @@ if(NOT ENABLE_PROTOBUF)
return()
endif()
# To avoid errors "'X' does not refer to a value" while using the `offsetof` function.
set(CMAKE_CXX_STANDARD 17)
set(Protobuf_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/google-protobuf/src")
if(OS_FREEBSD AND SANITIZE STREQUAL "address")
# ../contrib/protobuf/src/google/protobuf/arena_impl.h:45:10: fatal error: 'sanitizer/asan_interface.h' file not found

View File

@ -6,6 +6,8 @@ if(NOT ENABLE_GRPC)
return()
endif()
set(CMAKE_CXX_STANDARD 17)
set(_gRPC_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/grpc")
set(_gRPC_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/grpc")

View File

@ -22,7 +22,7 @@
# limitations under the License.
# We want to use C++23, but GRPC is not ready
set (CMAKE_CXX_STANDARD 20)
set (CMAKE_CXX_STANDARD 17)
set(_gRPC_ZLIB_INCLUDE_DIR "")
set(_gRPC_ZLIB_LIBRARIES ch_contrib::zlib)

View File

@ -2933,7 +2933,42 @@ The same as today() - 1.
## timeSlot
Rounds the time to the half hour.
Rounds the time to the start of a half-an-hour length interval.
**Syntax**
```sql
timeSlot(time[, time_zone])
```
**Arguments**
- `time` — Time to round to the start of a half-an-hour length interval. [DateTime](../data-types/datetime.md)/[Date32](../data-types/date32.md)/[DateTime64](../data-types/datetime64.md).
- `time_zone` — A constant String value or an expression representing the time zone. [String](../data-types/string.md).
:::note
Though this function can take values of the extended types `Date32` and `DateTime64` as an argument, passing it a time outside the normal range (year 1970 to 2149 for `Date` / 2106 for `DateTime`) will produce wrong results.
:::
**Return type**
- Returns the time rounded to the start of a half-an-hour length interval. [DateTime](../data-types/datetime.md).
**Example**
Query:
```sql
SELECT timeSlot(toDateTime('2000-01-02 03:04:05', 'UTC'));
```
Result:
```response
┌─timeSlot(toDateTime('2000-01-02 03:04:05', 'UTC'))─┐
│ 2000-01-02 03:00:00 │
└────────────────────────────────────────────────────┘
```
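Following the definition above, a timestamp in the second half of an hour is likewise rounded down to the start of its half-hour interval; for example (illustrative query):
```sql
SELECT timeSlot(toDateTime('2000-01-02 03:34:05', 'UTC'));
-- returns 2000-01-02 03:30:00
```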
## toYYYYMM

View File

@ -31,7 +31,7 @@ std::string RemoteProxyHostFetcherImpl::fetch(const Poco::URI & endpoint, const
endpoint.toString(),
response.getStatus(),
response.getReason(),
"");
/* body_length = */ 0);
std::string proxy_host;
Poco::StreamCopier::copyToString(response_body_stream, proxy_host);

View File

@ -25,15 +25,11 @@ namespace
* `curl` strips the leading dot and accepts the URL gitlab.com as a match for no_proxy .gitlab.com,
* while `wget` does an exact match.
* */
std::string buildPocoRegexpEntryWithoutLeadingDot(const std::string & host)
std::string buildPocoRegexpEntryWithoutLeadingDot(std::string_view host)
{
std::string_view view_without_leading_dot = host;
if (host[0] == '.')
{
view_without_leading_dot = std::string_view {host.begin() + 1u, host.end()};
}
return RE2::QuoteMeta(view_without_leading_dot);
if (host.starts_with('.'))
host.remove_prefix(1);
return RE2::QuoteMeta(host);
}
}

View File

@ -170,6 +170,9 @@ Avoid reordering rows when reading from Parquet files. Usually makes it much slo
)", 0) \
M(Bool, input_format_parquet_filter_push_down, true, R"(
When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata.
)", 0) \
M(Bool, input_format_parquet_bloom_filter_push_down, false, R"(
When reading Parquet files, skip whole row groups based on the WHERE expressions and bloom filter in the Parquet metadata.
)", 0) \
M(Bool, input_format_parquet_use_native_reader, false, R"(
When reading Parquet files, use the native reader instead of the Arrow reader.
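A hedged usage sketch for the new `input_format_parquet_bloom_filter_push_down` setting: it can be toggled per query when reading Parquet, e.g. via the `file()` table function (the file name and column below are illustrative placeholders, not part of this change):
```sql
-- 'data.parquet' and 'id' are placeholder names
SELECT count()
FROM file('data.parquet', Parquet)
WHERE id = 42
SETTINGS input_format_parquet_bloom_filter_push_down = true;
```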

View File

@ -102,6 +102,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
{"allow_experimental_refreshable_materialized_view", false, true, "Not experimental anymore"},
{"max_parts_to_move", 1000, 1000, "New setting"},
{"hnsw_candidate_list_size_for_search", 0, 0, "New setting"},
{"input_format_parquet_bloom_filter_push_down", false, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata."},
}
},
{"24.9",

View File

@ -36,8 +36,8 @@ public:
auto findByValue(const T & value) const
{
const auto it = value_to_name_map.find(value);
if (it == std::end(value_to_name_map))
auto it = value_to_name_map.find(value);
if (it == value_to_name_map.end())
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected value {} in enum", toString(value));
return it;
@ -58,7 +58,7 @@ public:
bool getNameForValue(const T & value, StringRef & result) const
{
const auto it = value_to_name_map.find(value);
if (it == std::end(value_to_name_map))
if (it == value_to_name_map.end())
return false;
result = it->second;

View File

@ -321,6 +321,8 @@ bool isUInt8(TYPE data_type) { return WhichDataType(data_type).isUInt8(); } \
bool isUInt16(TYPE data_type) { return WhichDataType(data_type).isUInt16(); } \
bool isUInt32(TYPE data_type) { return WhichDataType(data_type).isUInt32(); } \
bool isUInt64(TYPE data_type) { return WhichDataType(data_type).isUInt64(); } \
bool isUInt128(TYPE data_type) { return WhichDataType(data_type).isUInt128(); } \
bool isUInt256(TYPE data_type) { return WhichDataType(data_type).isUInt256(); } \
bool isNativeUInt(TYPE data_type) { return WhichDataType(data_type).isNativeUInt(); } \
bool isUInt(TYPE data_type) { return WhichDataType(data_type).isUInt(); } \
\
@ -328,6 +330,8 @@ bool isInt8(TYPE data_type) { return WhichDataType(data_type).isInt8(); } \
bool isInt16(TYPE data_type) { return WhichDataType(data_type).isInt16(); } \
bool isInt32(TYPE data_type) { return WhichDataType(data_type).isInt32(); } \
bool isInt64(TYPE data_type) { return WhichDataType(data_type).isInt64(); } \
bool isInt128(TYPE data_type) { return WhichDataType(data_type).isInt128(); } \
bool isInt256(TYPE data_type) { return WhichDataType(data_type).isInt256(); } \
bool isNativeInt(TYPE data_type) { return WhichDataType(data_type).isNativeInt(); } \
bool isInt(TYPE data_type) { return WhichDataType(data_type).isInt(); } \
\

View File

@ -458,6 +458,8 @@ bool isUInt8(TYPE data_type); \
bool isUInt16(TYPE data_type); \
bool isUInt32(TYPE data_type); \
bool isUInt64(TYPE data_type);\
bool isUInt128(TYPE data_type);\
bool isUInt256(TYPE data_type); \
bool isNativeUInt(TYPE data_type); \
bool isUInt(TYPE data_type); \
\
@ -465,6 +467,8 @@ bool isInt8(TYPE data_type); \
bool isInt16(TYPE data_type); \
bool isInt32(TYPE data_type); \
bool isInt64(TYPE data_type); \
bool isInt128(TYPE data_type); \
bool isInt256(TYPE data_type); \
bool isNativeInt(TYPE data_type); \
bool isInt(TYPE data_type); \
\

View File

@ -111,9 +111,9 @@ std::future<IAsynchronousReader::Result> ThreadPoolReader::submit(Request reques
/// RWF_NOWAIT flag may return 0 even when not at end of file.
/// It can't be distinguished from the real eof, so we have to
/// disable pread with nowait.
static std::atomic<bool> has_pread_nowait_support = !hasBugInPreadV2();
static const bool has_pread_nowait_support = !hasBugInPreadV2();
if (has_pread_nowait_support.load(std::memory_order_relaxed))
if (has_pread_nowait_support)
{
/// It reports real time spent including the time spent while thread was preempted doing nothing.
/// And it is Ok for the purpose of this watch (it is used to lower the number of threads to read from tables).
@ -161,7 +161,8 @@ std::future<IAsynchronousReader::Result> ThreadPoolReader::submit(Request reques
if (errno == ENOSYS || errno == EOPNOTSUPP)
{
/// No support for the syscall or the flag in the Linux kernel.
has_pread_nowait_support.store(false, std::memory_order_relaxed);
/// It shouldn't happen because we check the kernel version, but let's
/// fall back to the thread pool.
break;
}
if (errno == EAGAIN)

View File

@ -92,12 +92,26 @@ std::unique_ptr<S3::Client> getClient(
"Region should be explicitly specified for directory buckets");
}
const Settings & local_settings = context->getSettingsRef();
int s3_max_redirects = static_cast<int>(global_settings[Setting::s3_max_redirects]);
if (!for_disk_s3 && local_settings.isChanged("s3_max_redirects"))
s3_max_redirects = static_cast<int>(local_settings[Setting::s3_max_redirects]);
int s3_retry_attempts = static_cast<int>(global_settings[Setting::s3_retry_attempts]);
if (!for_disk_s3 && local_settings.isChanged("s3_retry_attempts"))
s3_retry_attempts = static_cast<int>(local_settings[Setting::s3_retry_attempts]);
bool enable_s3_requests_logging = global_settings[Setting::enable_s3_requests_logging];
if (!for_disk_s3 && local_settings.isChanged("enable_s3_requests_logging"))
enable_s3_requests_logging = local_settings[Setting::enable_s3_requests_logging];
S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration(
auth_settings.region,
context->getRemoteHostFilter(),
static_cast<int>(global_settings[Setting::s3_max_redirects]),
static_cast<int>(global_settings[Setting::s3_retry_attempts]),
global_settings[Setting::enable_s3_requests_logging],
s3_max_redirects,
s3_retry_attempts,
enable_s3_requests_logging,
for_disk_s3,
request_settings.get_request_throttler,
request_settings.put_request_throttler,
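A hedged sketch of what this change enables: for ad-hoc (non-disk) S3 access, query-level values of `s3_max_redirects`, `s3_retry_attempts` and `enable_s3_requests_logging` now take precedence over the global settings (the bucket URL below is a placeholder):
```sql
-- placeholder bucket and path
SELECT count()
FROM s3('https://my-bucket.s3.amazonaws.com/data/*.parquet', 'Parquet')
SETTINGS s3_retry_attempts = 5, s3_max_redirects = 2, enable_s3_requests_logging = 1;
```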

View File

@ -191,6 +191,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
format_settings.parquet.case_insensitive_column_matching = settings[Setting::input_format_parquet_case_insensitive_column_matching];
format_settings.parquet.preserve_order = settings[Setting::input_format_parquet_preserve_order];
format_settings.parquet.filter_push_down = settings[Setting::input_format_parquet_filter_push_down];
format_settings.parquet.bloom_filter_push_down = settings[Setting::input_format_parquet_bloom_filter_push_down];
format_settings.parquet.use_native_reader = settings[Setting::input_format_parquet_use_native_reader];
format_settings.parquet.allow_missing_columns = settings[Setting::input_format_parquet_allow_missing_columns];
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings[Setting::input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference];

View File

@ -274,6 +274,7 @@ struct FormatSettings
bool skip_columns_with_unsupported_types_in_schema_inference = false;
bool case_insensitive_column_matching = false;
bool filter_push_down = true;
bool bloom_filter_push_down = true;
bool use_native_reader = false;
std::unordered_set<int> skip_row_groups = {};
bool output_string_as_string = false;

View File

@ -84,11 +84,9 @@ void assertResponseIsOk(const String & uri, Poco::Net::HTTPResponse & response,
? ErrorCodes::RECEIVED_ERROR_TOO_MANY_REQUESTS
: ErrorCodes::RECEIVED_ERROR_FROM_REMOTE_IO_SERVER;
std::stringstream body; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
body.exceptions(std::ios::failbit);
body << istr.rdbuf();
throw HTTPException(code, uri, status, response.getReason(), body.str());
istr.seekg(0, std::ios::end);
size_t body_length = istr.tellg();
throw HTTPException(code, uri, status, response.getReason(), body_length);
}
}
@ -97,13 +95,13 @@ Exception HTTPException::makeExceptionMessage(
const std::string & uri,
Poco::Net::HTTPResponse::HTTPStatus http_status,
const std::string & reason,
const std::string & body)
size_t body_length)
{
return Exception(code,
"Received error from remote server {}. "
"HTTP status code: {} {}, "
"body: {}",
uri, static_cast<int>(http_status), reason, body);
"HTTP status code: {} '{}', "
"body length: {} bytes",
uri, static_cast<int>(http_status), reason, body_length);
}
}

View File

@ -27,9 +27,9 @@ public:
const std::string & uri,
Poco::Net::HTTPResponse::HTTPStatus http_status_,
const std::string & reason,
const std::string & body
size_t body_length = 0
)
: Exception(makeExceptionMessage(code, uri, http_status_, reason, body))
: Exception(makeExceptionMessage(code, uri, http_status_, reason, body_length))
, http_status(http_status_)
{}
@ -46,7 +46,7 @@ private:
const std::string & uri,
Poco::Net::HTTPResponse::HTTPStatus http_status,
const std::string & reason,
const std::string & body);
size_t body_length);
const char * name() const noexcept override { return "DB::HTTPException"; }
const char * className() const noexcept override { return "DB::HTTPException"; }

View File

@ -423,8 +423,7 @@ std::unique_ptr<ReadBuffer> ReadWriteBufferFromHTTP::initialize()
ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE,
current_uri.toString(),
Poco::Net::HTTPResponse::HTTP_REQUESTED_RANGE_NOT_SATISFIABLE,
reason,
"");
reason);
}
throw Exception(
ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE,
@ -549,8 +548,7 @@ size_t ReadWriteBufferFromHTTP::readBigAt(char * to, size_t n, size_t offset, co
ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE,
current_uri.toString(),
Poco::Net::HTTPResponse::HTTP_REQUESTED_RANGE_NOT_SATISFIABLE,
reason,
"");
reason);
}
copyFromIStreamWithProgressCallback(*result.response_stream, to, n, progress_callback, &bytes_copied, &is_canceled);

View File

@ -238,6 +238,8 @@ public:
const Columns & getOrderedSet() const { return ordered_set; }
const std::vector<KeyTuplePositionMapping> & getIndexesMapping() const { return indexes_mapping; }
private:
// If all arguments in tuple are key columns, we can optimize NOT IN when there is only one element.
bool has_all_keys;

View File

@ -274,6 +274,7 @@ size_t IRowInputFormat::countRows(size_t)
void IRowInputFormat::setSerializationHints(const SerializationInfoByName & hints)
{
if (supportsCustomSerializations())
serializations = getPort().getHeader().getSerializations(hints);
}

View File

@ -59,6 +59,7 @@ protected:
/// `max_block_size` can be ignored.
virtual size_t countRows(size_t max_block_size);
virtual bool supportsCountRows() const { return false; }
virtual bool supportsCustomSerializations() const { return false; }
virtual void readPrefix() {} /// delimiter before begin of result
virtual void readSuffix() {} /// delimiter after end of result

View File

@ -15,6 +15,7 @@
#include <arrow/type_fwd.h>
#include <boost/algorithm/string/case_conv.hpp>
#include <Common/Exception.h>
#include <parquet/metadata.h>
namespace arrow
@ -65,11 +66,22 @@ public:
return result;
}
// For a parquet schema {x: {i: int, j: int}}, this should be populated as follows
// clickhouse_index = 0, parquet_indexes = {0, 1}
struct ClickHouseIndexToParquetIndex
{
std::size_t clickhouse_index;
std::vector<int> parquet_indexes;
};
/// Only collect the required fields' indices. E.g. when reading just one field of a struct,
/// there is no need to collect the indices of the whole struct.
std::vector<int> findRequiredIndices(const Block & header, const arrow::Schema & schema)
std::vector<ClickHouseIndexToParquetIndex> findRequiredIndices(
const Block & header,
const arrow::Schema & schema,
const parquet::FileMetaData & file)
{
std::vector<int> required_indices;
std::vector<ClickHouseIndexToParquetIndex> required_indices;
std::unordered_set<int> added_indices;
/// Flatten all named fields' index information into a map.
auto fields_indices = calculateFieldIndices(schema);
@ -79,7 +91,7 @@ public:
std::string col_name = named_col.name;
if (ignore_case)
boost::to_lower(col_name);
findRequiredIndices(col_name, named_col.type, fields_indices, added_indices, required_indices);
findRequiredIndices(col_name, i, named_col.type, fields_indices, added_indices, required_indices, file);
}
return required_indices;
}
@ -169,10 +181,12 @@ private:
void findRequiredIndices(
const String & name,
std::size_t header_index,
DataTypePtr data_type,
const std::unordered_map<std::string, std::pair<int, int>> & field_indices,
std::unordered_set<int> & added_indices,
std::vector<int> & required_indices)
std::vector<ClickHouseIndexToParquetIndex> & required_indices,
const parquet::FileMetaData & file)
{
auto nested_type = removeNullable(data_type);
if (const DB::DataTypeTuple * type_tuple = typeid_cast<const DB::DataTypeTuple *>(nested_type.get()))
@ -187,20 +201,20 @@ private:
if (ignore_case)
boost::to_lower(field_name);
const auto & field_type = field_types[i];
findRequiredIndices(Nested::concatenateName(name, field_name), field_type, field_indices, added_indices, required_indices);
findRequiredIndices(Nested::concatenateName(name, field_name), header_index, field_type, field_indices, added_indices, required_indices, file);
}
return;
}
}
else if (const auto * type_array = typeid_cast<const DB::DataTypeArray *>(nested_type.get()))
{
findRequiredIndices(name, type_array->getNestedType(), field_indices, added_indices, required_indices);
findRequiredIndices(name, header_index, type_array->getNestedType(), field_indices, added_indices, required_indices, file);
return;
}
else if (const auto * type_map = typeid_cast<const DB::DataTypeMap *>(nested_type.get()))
{
findRequiredIndices(name, type_map->getKeyType(), field_indices, added_indices, required_indices);
findRequiredIndices(name, type_map->getValueType(), field_indices, added_indices, required_indices);
findRequiredIndices(name, header_index, type_map->getKeyType(), field_indices, added_indices, required_indices, file);
findRequiredIndices(name, header_index, type_map->getValueType(), field_indices, added_indices, required_indices, file);
return;
}
auto it = field_indices.find(name);
@ -211,14 +225,18 @@ private:
}
else
{
ClickHouseIndexToParquetIndex index_mapping;
index_mapping.clickhouse_index = header_index;
for (int j = 0; j < it->second.second; ++j)
{
auto index = it->second.first + j;
if (added_indices.insert(index).second)
{
required_indices.emplace_back(index);
index_mapping.parquet_indexes.emplace_back(index);
}
}
required_indices.emplace_back(index_mapping);
}
}
};

View File

@ -43,6 +43,7 @@ private:
size_t countRows(size_t max_block_size) override;
bool supportsCountRows() const override { return true; }
bool supportsCustomSerializations() const override { return true; }
const String & columnName(size_t i) const;
size_t columnIndex(StringRef name, size_t key_index);

View File

@ -0,0 +1,525 @@
#include <Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h>
#include <iostream>
#if USE_PARQUET
#include <parquet/bloom_filter.h>
#include <parquet/xxhasher.h>
#include <Interpreters/convertFieldToType.h>
#include <Columns/ColumnConst.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
namespace
{
bool isParquetStringTypeSupportedForBloomFilters(
const std::shared_ptr<const parquet::LogicalType> & logical_type,
parquet::ConvertedType::type converted_type)
{
if (logical_type &&
!logical_type->is_none()
&& !(logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON()))
{
return false;
}
if (parquet::ConvertedType::type::NONE != converted_type &&
!(converted_type == parquet::ConvertedType::JSON || converted_type == parquet::ConvertedType::UTF8
|| converted_type == parquet::ConvertedType::BSON))
{
return false;
}
return true;
}
bool isParquetIntegerTypeSupportedForBloomFilters(const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
{
if (logical_type && !logical_type->is_none() && !logical_type->is_int())
{
return false;
}
if (parquet::ConvertedType::type::NONE != converted_type && !(converted_type == parquet::ConvertedType::INT_8 || converted_type == parquet::ConvertedType::INT_16
|| converted_type == parquet::ConvertedType::INT_32 || converted_type == parquet::ConvertedType::INT_64
|| converted_type == parquet::ConvertedType::UINT_8 || converted_type == parquet::ConvertedType::UINT_16
|| converted_type == parquet::ConvertedType::UINT_32 || converted_type == parquet::ConvertedType::UINT_64))
{
return false;
}
return true;
}
template <typename T>
uint64_t hashSpecialFLBATypes(const Field & field)
{
const T & value = field.safeGet<T>();
parquet::FLBA flba(reinterpret_cast<const uint8_t*>(&value));
parquet::XxHasher hasher;
return hasher.Hash(&flba, sizeof(T));
};
std::optional<uint64_t> tryHashStringWithoutCompatibilityCheck(const Field & field)
{
const auto field_type = field.getType();
if (field_type != Field::Types::Which::String)
{
return std::nullopt;
}
parquet::XxHasher hasher;
parquet::ByteArray ba { field.safeGet<std::string>() };
return hasher.Hash(&ba);
}
std::optional<uint64_t> tryHashString(
const Field & field,
const std::shared_ptr<const parquet::LogicalType> & logical_type,
parquet::ConvertedType::type converted_type)
{
if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type))
{
return std::nullopt;
}
return tryHashStringWithoutCompatibilityCheck(field);
}
std::optional<uint64_t> tryHashFLBA(
const Field & field,
const std::shared_ptr<const parquet::LogicalType> & logical_type,
parquet::ConvertedType::type converted_type,
std::size_t parquet_column_length)
{
if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type))
{
return std::nullopt;
}
const auto field_type = field.getType();
if (field_type == Field::Types::Which::IPv6 && parquet_column_length == sizeof(IPv6))
{
return hashSpecialFLBATypes<IPv6>(field);
}
return tryHashStringWithoutCompatibilityCheck(field);
}
template <typename ParquetPhysicalType>
std::optional<uint64_t> tryHashInt(const Field & field, const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
{
if (!isParquetIntegerTypeSupportedForBloomFilters(logical_type, converted_type))
{
return std::nullopt;
}
parquet::XxHasher hasher;
if (field.getType() == Field::Types::Which::Int64)
{
return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<int64_t>()));
}
else if (field.getType() == Field::Types::Which::UInt64)
{
return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<uint64_t>()));
}
else if (field.getType() == Field::Types::IPv4)
{
/*
* In theory, we could accept IPv4 values stored in 64-bit variables. That would only be a problem if the value had been hashed using the byte array API
* with a zeroed buffer into which a 32-bit variable was copied.
*
* To be on the safe side, accept it only when the physical type is 32 bits.
* */
if constexpr (std::is_same_v<int32_t, ParquetPhysicalType>)
{
return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<IPv4>()));
}
}
return std::nullopt;
}
std::optional<uint64_t> tryHash(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor)
{
const auto physical_type = parquet_column_descriptor->physical_type();
const auto & logical_type = parquet_column_descriptor->logical_type();
const auto converted_type = parquet_column_descriptor->converted_type();
switch (physical_type)
{
case parquet::Type::type::INT32:
return tryHashInt<int32_t>(field, logical_type, converted_type);
case parquet::Type::type::INT64:
return tryHashInt<int64_t>(field, logical_type, converted_type);
case parquet::Type::type::BYTE_ARRAY:
return tryHashString(field, logical_type, converted_type);
case parquet::Type::type::FIXED_LEN_BYTE_ARRAY:
return tryHashFLBA(field, logical_type, converted_type, parquet_column_descriptor->type_length());
default:
return std::nullopt;
}
}
std::optional<std::vector<uint64_t>> hash(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor)
{
std::vector<uint64_t> hashes;
for (size_t i = 0u; i < data_column->size(); i++)
{
Field f;
data_column->get(i, f);
auto hashed_value = tryHash(f, parquet_column_descriptor);
if (!hashed_value)
{
return std::nullopt;
}
hashes.emplace_back(*hashed_value);
}
return hashes;
}
bool maybeTrueOnBloomFilter(const std::vector<uint64_t> & hashes, const std::unique_ptr<parquet::BloomFilter> & bloom_filter)
{
for (const auto hash : hashes)
{
if (bloom_filter->FindHash(hash))
{
return true;
}
}
return false;
}
const parquet::ColumnDescriptor * getColumnDescriptorIfBloomFilterIsPresent(
const std::unique_ptr<parquet::RowGroupMetaData> & parquet_rg_metadata,
const std::vector<ArrowFieldIndexUtil::ClickHouseIndexToParquetIndex> & clickhouse_column_index_to_parquet_index,
std::size_t clickhouse_column_index)
{
if (clickhouse_column_index_to_parquet_index.size() <= clickhouse_column_index)
{
return nullptr;
}
const auto & parquet_indexes = clickhouse_column_index_to_parquet_index[clickhouse_column_index].parquet_indexes;
// complex types like structs, tuples and maps will have more than one index.
// we don't support those for now
if (parquet_indexes.size() > 1)
{
return nullptr;
}
if (parquet_indexes.empty())
{
throw Exception(ErrorCodes::LOGICAL_ERROR, "Something bad happened, raise an issue and try the query with `input_format_parquet_bloom_filter_push_down=false`");
}
auto parquet_column_index = parquet_indexes[0];
const auto * parquet_column_descriptor = parquet_rg_metadata->schema()->Column(parquet_column_index);
bool column_has_bloom_filter = parquet_rg_metadata->ColumnChunk(parquet_column_index)->bloom_filter_offset().has_value();
if (!column_has_bloom_filter)
{
return nullptr;
}
return parquet_column_descriptor;
}
}
ParquetBloomFilterCondition::ParquetBloomFilterCondition(const std::vector<ConditionElement> & condition_, const Block & header_)
: condition(condition_), header(header_)
{
}
bool ParquetBloomFilterCondition::mayBeTrueOnRowGroup(const ColumnIndexToBF & column_index_to_column_bf) const
{
using Function = ConditionElement::Function;
std::vector<BoolMask> rpn_stack;
for (const auto & element : condition)
{
if (element.function == Function::FUNCTION_IN
|| element.function == Function::FUNCTION_NOT_IN)
{
bool maybe_true = true;
for (auto column_index = 0u; column_index < element.hashes_per_column.size(); column_index++)
{
// in case bloom filter is not present for this row group
// https://github.com/ClickHouse/ClickHouse/pull/62966#discussion_r1722361237
if (!column_index_to_column_bf.contains(element.key_columns[column_index]))
{
rpn_stack.emplace_back(true, true);
continue;
}
bool column_maybe_contains = maybeTrueOnBloomFilter(
element.hashes_per_column[column_index],
column_index_to_column_bf.at(element.key_columns[column_index]));
if (!column_maybe_contains)
{
maybe_true = false;
break;
}
}
rpn_stack.emplace_back(maybe_true, true);
if (element.function == Function::FUNCTION_NOT_IN)
rpn_stack.back() = !rpn_stack.back();
}
else if (element.function == Function::FUNCTION_NOT)
{
rpn_stack.back() = !rpn_stack.back();
}
else if (element.function == Function::FUNCTION_OR)
{
auto arg1 = rpn_stack.back();
rpn_stack.pop_back();
auto arg2 = rpn_stack.back();
rpn_stack.back() = arg1 | arg2;
}
else if (element.function == Function::FUNCTION_AND)
{
auto arg1 = rpn_stack.back();
rpn_stack.pop_back();
auto arg2 = rpn_stack.back();
rpn_stack.back() = arg1 & arg2;
}
else if (element.function == Function::ALWAYS_TRUE)
{
rpn_stack.emplace_back(true, false);
}
else if (element.function == Function::ALWAYS_FALSE)
{
rpn_stack.emplace_back(false, true);
}
else
{
rpn_stack.emplace_back(true, true);
}
}
if (rpn_stack.size() != 1)
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::mayBeTrueOnRowGroup");
return rpn_stack[0].can_be_true;
}
std::unordered_set<std::size_t> ParquetBloomFilterCondition::getFilteringColumnKeys() const
{
std::unordered_set<std::size_t> column_keys;
for (const auto & element : condition)
{
for (const auto index : element.key_columns)
{
column_keys.insert(index);
}
}
return column_keys;
}
/*
* `KeyCondition::rpn` is overly complex for bloom filters; some operations are not even supported. Moreover, to avoid hashing each time
* we loop over an RPN element, we need to store hashes instead of the WHERE predicate values. To address this, we loop over `KeyCondition::rpn`
* and build a simplified RPN that holds hashes instead of values.
*
* `KeyCondition::RPNElement::FUNCTION_IN_RANGE` becomes:
* `FUNCTION_IN`
* `FUNCTION_UNKNOWN` when range limits are different
* `KeyCondition::RPNElement::FUNCTION_IN_SET` becomes
* `FUNCTION_IN`
*
* Complex types and structs are not supported.
* There are two sources of data types being analyzed, and they need to be compatible: DB::Field type and parquet type.
* This is determined by the `isColumnSupported` method.
*
* Some interesting examples:
* 1. file(..., 'str_column UInt64') where str_column = 50; Field.type == UInt64. Parquet type string. Not supported.
* 2. file(...) where str_column = 50; Field.type == String (conversion already taken care by `KeyCondition`). Parquet type string.
* 3. file(...) where uint32_column = toIPv4(5). Field.type == IPv4. Incompatible column types, resolved by `KeyCondition` itself.
* 4. file(...) where toIPv4(uint32_column) = toIPv4(5). Field.type == IPv4. We know it is safe to hash it using an int32 API.
* */
std::vector<ParquetBloomFilterCondition::ConditionElement> keyConditionRPNToParquetBloomFilterCondition(
const std::vector<KeyCondition::RPNElement> & rpn,
const std::vector<ArrowFieldIndexUtil::ClickHouseIndexToParquetIndex> & clickhouse_column_index_to_parquet_index,
const std::unique_ptr<parquet::RowGroupMetaData> & parquet_rg_metadata)
{
std::vector<ParquetBloomFilterCondition::ConditionElement> condition_elements;
using RPNElement = KeyCondition::RPNElement;
using Function = ParquetBloomFilterCondition::ConditionElement::Function;
for (const auto & rpn_element : rpn)
{
// this would be a problem for `where negate(x) = -58`.
// It would perform a bf search on `-58`, and possibly miss row groups containing this data.
if (!rpn_element.monotonic_functions_chain.empty())
{
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
continue;
}
ParquetBloomFilterCondition::ConditionElement::HashesForColumns hashes;
if (rpn_element.function == RPNElement::FUNCTION_IN_RANGE
|| rpn_element.function == RPNElement::FUNCTION_NOT_IN_RANGE)
{
// Only FUNCTION_EQUALS is supported, and for that the range extremes need to be equal
if (rpn_element.range.left != rpn_element.range.right)
{
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
continue;
}
const auto * parquet_column_descriptor =
getColumnDescriptorIfBloomFilterIsPresent(parquet_rg_metadata, clickhouse_column_index_to_parquet_index, rpn_element.key_column);
if (!parquet_column_descriptor)
{
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
continue;
}
auto hashed_value = tryHash(rpn_element.range.left, parquet_column_descriptor);
if (!hashed_value)
{
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
continue;
}
std::vector<uint64_t> hashes_for_column;
hashes_for_column.emplace_back(*hashed_value);
hashes.emplace_back(std::move(hashes_for_column));
auto function = rpn_element.function == RPNElement::FUNCTION_IN_RANGE
? ParquetBloomFilterCondition::ConditionElement::Function::FUNCTION_IN
: ParquetBloomFilterCondition::ConditionElement::Function::FUNCTION_NOT_IN;
std::vector<std::size_t> key_columns;
key_columns.emplace_back(rpn_element.key_column);
condition_elements.emplace_back(function, std::move(hashes), std::move(key_columns));
}
else if (rpn_element.function == RPNElement::FUNCTION_IN_SET
|| rpn_element.function == RPNElement::FUNCTION_NOT_IN_SET)
{
const auto & set_index = rpn_element.set_index;
const auto & ordered_set = set_index->getOrderedSet();
const auto & indexes_mapping = set_index->getIndexesMapping();
bool found_empty_column = false;
std::vector<std::size_t> key_columns;
for (auto i = 0u; i < ordered_set.size(); i++)
{
const auto & set_column = ordered_set[i];
const auto * parquet_column_descriptor = getColumnDescriptorIfBloomFilterIsPresent(
parquet_rg_metadata,
clickhouse_column_index_to_parquet_index,
indexes_mapping[i].key_index);
if (!parquet_column_descriptor)
{
continue;
}
auto column = set_column;
if (column->empty())
{
found_empty_column = true;
break;
}
if (const auto & nullable_column = checkAndGetColumn<ColumnNullable>(set_column.get()))
{
column = nullable_column->getNestedColumnPtr();
}
auto hashes_for_column_opt = hash(column.get(), parquet_column_descriptor);
if (!hashes_for_column_opt)
{
continue;
}
auto & hashes_for_column = *hashes_for_column_opt;
if (hashes_for_column.empty())
{
continue;
}
hashes.emplace_back(hashes_for_column);
key_columns.push_back(indexes_mapping[i].key_index);
}
if (found_empty_column)
{
condition_elements.emplace_back(Function::ALWAYS_FALSE);
continue;
}
if (hashes.empty())
{
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
continue;
}
auto function = RPNElement::FUNCTION_IN_SET == rpn_element.function ? Function::FUNCTION_IN : Function::FUNCTION_NOT_IN;
condition_elements.emplace_back(function, hashes, key_columns);
}
else if (rpn_element.function == RPNElement::FUNCTION_NOT)
{
condition_elements.emplace_back(Function::FUNCTION_NOT);
}
else if (rpn_element.function == RPNElement::FUNCTION_OR)
{
condition_elements.emplace_back(Function::FUNCTION_OR);
}
else if (rpn_element.function == RPNElement::FUNCTION_AND)
{
condition_elements.emplace_back(Function::FUNCTION_AND);
}
else
{
condition_elements.emplace_back(Function::ALWAYS_TRUE);
}
}
return condition_elements;
}
}
#endif

View File

@ -0,0 +1,73 @@
#pragma once
#include <config.h>
#if USE_PARQUET
#include <Storages/MergeTree/KeyCondition.h>
#include <parquet/metadata.h>
#include <Processors/Formats/Impl/ArrowFieldIndexUtil.h>
namespace parquet
{
class BloomFilter;
}
namespace DB
{
class ParquetBloomFilterCondition
{
public:
struct ConditionElement
{
enum Function
{
/// Atoms of a Boolean expression.
FUNCTION_IN,
FUNCTION_NOT_IN,
/// Can take any value.
FUNCTION_UNKNOWN,
/// Operators of the logical expression.
FUNCTION_NOT,
FUNCTION_AND,
FUNCTION_OR,
/// Constants
ALWAYS_FALSE,
ALWAYS_TRUE,
};
using ColumnPtr = IColumn::Ptr;
using HashesForColumns = std::vector<std::vector<uint64_t>>;
using KeyColumns = std::vector<std::size_t>;
Function function;
// each entry represents a list of hashes per column
// suppose there are three columns with 2 rows each
// hashes_per_column.size() == 3 and hashes_per_column[0].size() == 2
HashesForColumns hashes_per_column;
KeyColumns key_columns;
};
using RPNElement = KeyCondition::RPNElement;
using ColumnIndexToBF = std::unordered_map<std::size_t, std::unique_ptr<parquet::BloomFilter>>;
explicit ParquetBloomFilterCondition(const std::vector<ConditionElement> & condition_, const Block & header_);
bool mayBeTrueOnRowGroup(const ColumnIndexToBF & column_index_to_column_bf) const;
std::unordered_set<std::size_t> getFilteringColumnKeys() const;
private:
std::vector<ParquetBloomFilterCondition::ConditionElement> condition;
Block header;
};
std::vector<ParquetBloomFilterCondition::ConditionElement> keyConditionRPNToParquetBloomFilterCondition(
const std::vector<KeyCondition::RPNElement> & rpn,
const std::vector<ArrowFieldIndexUtil::ClickHouseIndexToParquetIndex> & clickhouse_column_index_to_parquet_index,
const std::unique_ptr<parquet::RowGroupMetaData> & parquet_rg_metadata);
}
#endif

View File

@ -14,6 +14,8 @@
#include <arrow/status.h>
#include <parquet/arrow/reader.h>
#include <parquet/arrow/schema.h>
#include <parquet/bloom_filter.h>
#include <parquet/bloom_filter_reader.h>
#include <parquet/file_reader.h>
#include <parquet/statistics.h>
#include "ArrowBufferedStreams.h"
@ -25,6 +27,7 @@
#include <DataTypes/DataTypeNullable.h>
#include <Common/FieldVisitorsAccurateComparison.h>
#include <Processors/Formats/Impl/Parquet/ParquetRecordReader.h>
#include <Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h>
#include <Interpreters/convertFieldToType.h>
namespace CurrentMetrics
@ -263,6 +266,50 @@ static Field decodePlainParquetValueSlow(const std::string & data, parquet::Type
return field;
}
static ParquetBloomFilterCondition::ColumnIndexToBF buildColumnIndexToBF(
parquet::BloomFilterReader & bf_reader,
int row_group,
const std::vector<ArrowFieldIndexUtil::ClickHouseIndexToParquetIndex> & clickhouse_column_index_to_parquet_index,
const std::unordered_set<std::size_t> & filtering_columns
)
{
auto rg_bf = bf_reader.RowGroup(row_group);
if (!rg_bf)
{
return {};
}
ParquetBloomFilterCondition::ColumnIndexToBF index_to_column_bf;
for (const auto & [clickhouse_index, parquet_indexes] : clickhouse_column_index_to_parquet_index)
{
if (!filtering_columns.contains(clickhouse_index))
{
continue;
}
// Complex / nested types contain more than one index. We don't support those.
if (parquet_indexes.size() > 1)
{
continue;
}
auto parquet_index = parquet_indexes[0];
auto bf = rg_bf->GetColumnBloomFilter(parquet_index);
if (!bf)
{
continue;
}
index_to_column_bf[clickhouse_index] = std::move(bf);
}
return index_to_column_bf;
}
/// Range of values for each column, based on statistics in the Parquet metadata.
/// These are lower/upper bounds, not necessarily the exact min and max; e.g. the min/max can simply be
/// missing from the metadata.
@ -474,9 +521,27 @@ void ParquetBlockInputFormat::initializeIfNeeded()
ArrowFieldIndexUtil field_util(
format_settings.parquet.case_insensitive_column_matching,
format_settings.parquet.allow_missing_columns);
column_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema);
auto index_mapping = field_util.findRequiredIndices(getPort().getHeader(), *schema, *metadata);
for (const auto & [clickhouse_header_index, parquet_indexes] : index_mapping)
{
for (auto parquet_index : parquet_indexes)
{
column_indices.push_back(parquet_index);
}
}
int num_row_groups = metadata->num_row_groups();
if (num_row_groups == 0)
{
return;
}
const auto bf_reader_properties = parquet::default_reader_properties();
std::unique_ptr<parquet::BloomFilterReader> bf_reader;
row_group_batches.reserve(num_row_groups);
auto adaptive_chunk_size = [&](int row_group_idx) -> size_t
@ -497,11 +562,38 @@ void ParquetBlockInputFormat::initializeIfNeeded()
return std::min(std::max(preferred_num_rows, MIN_ROW_NUM), static_cast<size_t>(format_settings.parquet.max_block_size));
};
std::unique_ptr<ParquetBloomFilterCondition> parquet_bloom_filter_condition;
std::unordered_set<std::size_t> filtering_columns;
if (format_settings.parquet.bloom_filter_push_down && key_condition)
{
bf_reader = parquet::BloomFilterReader::Make(arrow_file, metadata, bf_reader_properties, nullptr);
const auto parquet_conditions = keyConditionRPNToParquetBloomFilterCondition(
key_condition->getRPN(),
index_mapping,
metadata->RowGroup(0));
parquet_bloom_filter_condition = std::make_unique<ParquetBloomFilterCondition>(parquet_conditions, getPort().getHeader());
filtering_columns = parquet_bloom_filter_condition->getFilteringColumnKeys();
}
for (int row_group = 0; row_group < num_row_groups; ++row_group)
{
if (skip_row_groups.contains(row_group))
continue;
if (parquet_bloom_filter_condition)
{
const auto column_index_to_bf = buildColumnIndexToBF(*bf_reader, row_group, index_mapping, filtering_columns);
if (!parquet_bloom_filter_condition->mayBeTrueOnRowGroup(column_index_to_bf))
{
continue;
}
}
if (format_settings.parquet.filter_push_down && key_condition
&& !key_condition
->checkInHyperrectangle(

View File

@ -38,6 +38,7 @@ private:
bool supportsCountRows() const override { return true; }
size_t countRows(size_t max_block_size) override;
bool supportsCustomSerializations() const override { return true; }
const FormatSettings format_settings;

View File

@ -48,6 +48,7 @@ protected:
bool isGarbageAfterField(size_t index, ReadBuffer::Position pos) override;
void setReadBuffer(ReadBuffer & in_) override;
void readPrefix() override;
bool supportsCustomSerializations() const override { return true; }
const FormatSettings format_settings;
DataTypes data_types;

View File

@ -817,7 +817,7 @@ void KeyCondition::getAllSpaceFillingCurves()
KeyCondition::KeyCondition(
const ActionsDAG * filter_dag,
ContextPtr context,
const Names & key_column_names,
const Names & key_column_names_,
const ExpressionActionsPtr & key_expr_,
bool single_point_)
: key_expr(key_expr_)
@ -825,7 +825,7 @@ KeyCondition::KeyCondition(
, single_point(single_point_)
{
size_t key_index = 0;
for (const auto & name : key_column_names)
for (const auto & name : key_column_names_)
{
if (!key_columns.contains(name))
{

View File

@ -149,7 +149,6 @@ void traverse(const ParserImpl::Element & e, std::shared_ptr<JSONNode> node)
std::shared_ptr<JSONNode> parseJSON(const String & json)
{
std::string_view view{json.begin(), json.end()};
ParserImpl::Element document;
ParserImpl p;

View File

@ -62,6 +62,7 @@ public:
/// Avoid loading nested table by returning nullptr/false for all table functions.
StoragePolicyPtr getStoragePolicy() const override { return nullptr; }
bool storesDataOnDisk() const override { return false; }
bool supportsReplication() const override { return false; }
void startup() override { }
void shutdown(bool is_drop) override

View File

@ -0,0 +1,16 @@
<clickhouse>
<remote_servers>
<cluster>
<shard>
<replica>
<host>node1</host>
<port>9000</port>
</replica>
<replica>
<host>node2</host>
<port>9000</port>
</replica>
</shard>
</cluster>
</remote_servers>
</clickhouse>
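For reference, a minimal sketch of how a cluster defined this way can be addressed from SQL, assuming the cluster name `cluster` from the XML above:
```sql
-- illustrative: run a trivial query on every replica of the test cluster
SELECT hostName() FROM clusterAllReplicas('cluster', system.one);
```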

View File

@ -35,6 +35,7 @@ from pyspark.sql.window import Window
import helpers.client
from helpers.cluster import ClickHouseCluster
from helpers.network import PartitionManager
from helpers.s3_tools import (
get_file_contents,
list_s3_objects,
@ -74,10 +75,23 @@ def started_cluster():
main_configs=[
"configs/config.d/named_collections.xml",
"configs/config.d/filesystem_caches.xml",
"configs/config.d/remote_servers.xml",
],
user_configs=["configs/users.d/users.xml"],
with_minio=True,
stay_alive=True,
with_zookeeper=True,
)
cluster.add_instance(
"node2",
main_configs=[
"configs/config.d/named_collections.xml",
"configs/config.d/remote_servers.xml",
],
user_configs=["configs/users.d/users.xml"],
with_minio=True,
stay_alive=True,
with_zookeeper=True,
)
logging.info("Starting cluster...")
@ -891,3 +905,100 @@ def test_filesystem_cache(started_cluster, storage_type):
f"SELECT ProfileEvents['S3GetObject'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'"
)
)
def test_replicated_database_and_unavailable_s3(started_cluster):
node1 = started_cluster.instances["node1"]
node2 = started_cluster.instances["node2"]
DB_NAME = randomize_table_name("db")
TABLE_NAME = randomize_table_name("test_replicated_database_and_unavailable_s3")
minio_client = started_cluster.minio_client
bucket = started_cluster.minio_restricted_bucket
if not minio_client.bucket_exists(bucket):
minio_client.make_bucket(bucket)
node1.query(
f"CREATE DATABASE {DB_NAME} ENGINE=Replicated('/clickhouse/databases/{DB_NAME}', 'shard1', 'node1')"
)
node2.query(
f"CREATE DATABASE {DB_NAME} ENGINE=Replicated('/clickhouse/databases/{DB_NAME}', 'shard1', 'node2')"
)
parquet_data_path = create_initial_data_file(
started_cluster,
node1,
"SELECT number, toString(number) FROM numbers(100)",
TABLE_NAME,
)
endpoint_url = f"http://{started_cluster.minio_ip}:{started_cluster.minio_port}"
aws_access_key_id = "minio"
aws_secret_access_key = "minio123"
schema = pa.schema(
[
("id", pa.int32()),
("name", pa.string()),
]
)
data = [
pa.array([1, 2, 3], type=pa.int32()),
pa.array(["John Doe", "Jane Smith", "Jake Johnson"], type=pa.string()),
]
storage_options = {
"AWS_ENDPOINT_URL": endpoint_url,
"AWS_ACCESS_KEY_ID": aws_access_key_id,
"AWS_SECRET_ACCESS_KEY": aws_secret_access_key,
"AWS_ALLOW_HTTP": "true",
"AWS_S3_ALLOW_UNSAFE_RENAME": "true",
}
path = f"s3://root/{TABLE_NAME}"
table = pa.Table.from_arrays(data, schema=schema)
write_deltalake(path, table, storage_options=storage_options)
with PartitionManager() as pm:
pm_rule_reject = {
"probability": 1,
"destination": node2.ip_address,
"source_port": started_cluster.minio_port,
"action": "REJECT --reject-with tcp-reset",
}
pm_rule_drop_all = {
"destination": node2.ip_address,
"source_port": started_cluster.minio_port,
"action": "DROP",
}
pm._add_rule(pm_rule_reject)
node1.query(
f"""
DROP TABLE IF EXISTS {DB_NAME}.{TABLE_NAME};
CREATE TABLE {DB_NAME}.{TABLE_NAME}
AS deltaLake('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/root/{TABLE_NAME}' , 'minio', 'minio123')
"""
)
assert TABLE_NAME in node1.query(
f"select name from system.tables where database = '{DB_NAME}'"
)
assert TABLE_NAME in node2.query(
f"select name from system.tables where database = '{DB_NAME}'"
)
replica_path = f"/clickhouse/databases/{DB_NAME}/replicas/shard1|node2"
zk = started_cluster.get_kazoo_client("zoo1")
zk.set(replica_path + "/digest", "123456".encode())
assert "123456" in node2.query(
f"SELECT * FROM system.zookeeper WHERE path = '{replica_path}'"
)
node2.restart_clickhouse()
assert "123456" not in node2.query(
f"SELECT * FROM system.zookeeper WHERE path = '{replica_path}'"
)

View File

@ -0,0 +1,347 @@
1000
bloom filter is off, all row groups should be read
expect rows_read = select count()
{
"data": [
{
"string": "AZSR",
"flba": "WNMM"
},
{
"string": "PFJH",
"flba": "GKJC"
}
],
"rows": 2,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
bloom filter is on, some row groups should be skipped
expect rows_read much less than select count()
{
"data": [
{
"string": "AZSR",
"flba": "WNMM"
},
{
"string": "PFJH",
"flba": "GKJC"
}
],
"rows": 2,
"statistics": {
"rows_read": 464,
"bytes_read": 21703
}
}
bloom filter is on, but where predicate contains data from 2 row groups out of 3.
Rows read should be less than select count, but greater than previous selects
{
"data": [
{
"string": "PFJH",
"flba": "GKJC"
},
{
"string": "ZHZK",
"flba": "HRWD"
}
],
"rows": 2,
"statistics": {
"rows_read": 536,
"bytes_read": 25708
}
}
bloom filter is on, but where predicate contains data from all row groups
expect rows_read = select count()
{
"data": [
{
"string": "PFJH",
"flba": "GKJC"
},
{
"string": "OKAI",
"flba": "UXGT"
},
{
"string": "ZHZK",
"flba": "HRWD"
}
],
"rows": 3,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
IN check
{
"data": [
{
"string": "PFJH",
"flba": "GKJC"
},
{
"string": "ZHZK",
"flba": "HRWD"
}
],
"rows": 2,
"statistics": {
"rows_read": 536,
"bytes_read": 25708
}
}
tuple in case, bf is off.
{
"data": [
{
"string": "PFJH",
"flba": "GKJC"
}
],
"rows": 1,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
tuple in case, bf is on.
{
"data": [
{
"string": "PFJH",
"flba": "GKJC"
}
],
"rows": 1,
"statistics": {
"rows_read": 464,
"bytes_read": 21703
}
}
complex tuple in case, bf is off
{
"data": [
{
"string": "PFJH",
"flba": "GKJC"
}
],
"rows": 1,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
complex tuple in case, bf is on
{
"data": [
{
"string": "PFJH",
"flba": "GKJC"
}
],
"rows": 1,
"statistics": {
"rows_read": 464,
"bytes_read": 21703
}
}
complex tuple in case, bf is on. Non existent
{
"data": [],
"rows": 0,
"statistics": {
"rows_read": 0,
"bytes_read": 0
}
}
Bloom filter for json column. BF is off
{
"data": [
{
"json": "{\"key\":38, \"value\":\"NXONM\"}"
}
],
"rows": 1,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
Bloom filter for json column. BF is on
{
"data": [
{
"json": "{\"key\":38, \"value\":\"NXONM\"}"
}
],
"rows": 1,
"statistics": {
"rows_read": 72,
"bytes_read": 4005
}
}
Bloom filter for ipv4 column. BF is off
{
"data": [
{
"json": "{\"key\":38, \"value\":\"NXONM\"}"
}
],
"rows": 1,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
Bloom filter for ipv4 column. BF is on
{
"data": [
{
"json": "{\"key\":38, \"value\":\"NXONM\"}"
}
],
"rows": 1,
"statistics": {
"rows_read": 72,
"bytes_read": 4005
}
}
Bloom filter for ipv4 column. BF is on. Specified in the schema
{
"data": [
{
"ipv4": "0.0.1.143"
}
],
"rows": 1,
"statistics": {
"rows_read": 72,
"bytes_read": 4005
}
}
Bloom filter on 64 bit column read as ipv4. We explicitly deny it, should read all rg
{
"data": [
{
"uint64_logical": "22.230.220.164"
}
],
"rows": 1,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
BF off for parquet uint64 logical type. Should read everything
{
"data": [
{
"json": "{\"key\":683, \"value\":\"YKCPD\"}"
}
],
"rows": 1,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
BF on for parquet uint64 logical type. UInt64 is stored as a signed int64, but with a logical type annotation. Make sure a value greater than the int64 maximum can be queried
{
"data": [
{
"json": "{\"key\":683, \"value\":\"YKCPD\"}"
}
],
"rows": 1,
"statistics": {
"rows_read": 464,
"bytes_read": 21711
}
}
UInt16 is stored as physical type int32 with bitwidth = 16 and sign = false. Make sure a value greater than the int16 maximum can be queried. BF is on.
{
"data": [
{
"json": "{\"key\":874, \"value\":\"JENHW\"}"
}
],
"rows": 1,
"statistics": {
"rows_read": 464,
"bytes_read": 21703
}
}
BF off for parquet int8 logical type. Should read everything
{
"data": [
{
"json": "{\"key\":89, \"value\":\"MFIYP\"}"
},
{
"json": "{\"key\":321, \"value\":\"JNOIA\"}"
},
{
"json": "{\"key\":938, \"value\":\"UBMLO\"}"
},
{
"json": "{\"key\":252, \"value\":\"ZVLKF\"}"
}
],
"rows": 4,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
BF on for parquet int8 logical type. Should skip row groups
{
"data": [
{
"json": "{\"key\":89, \"value\":\"MFIYP\"}"
},
{
"json": "{\"key\":321, \"value\":\"JNOIA\"}"
},
{
"json": "{\"key\":938, \"value\":\"UBMLO\"}"
},
{
"json": "{\"key\":252, \"value\":\"ZVLKF\"}"
}
],
"rows": 4,
"statistics": {
"rows_read": 536,
"bytes_read": 25716
}
}
Invalid column conversion with the IN operation. The String type cannot be hashed against the parquet int64 physical type. Should read everything
{
"data": [],
"rows": 0,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}
Transformations on key column shall not be allowed. Should read everything
{
"data": [
{
"uint64_logical": "7711695863945021976"
}
],
"rows": 1,
"statistics": {
"rows_read": 1000,
"bytes_read": 47419
}
}

View File

@ -0,0 +1,96 @@
#!/usr/bin/env bash
# Tags: no-ubsan, no-fasttest
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}"
mkdir -p "${WORKING_DIR}"
DATA_FILE="${CUR_DIR}/data_parquet/multi_column_bf.gz.parquet"
DATA_FILE_USER_PATH="${WORKING_DIR}/multi_column_bf.gz.parquet"
cp ${DATA_FILE} ${DATA_FILE_USER_PATH}
${CLICKHOUSE_CLIENT} --query="select count(*) from file('${DATA_FILE_USER_PATH}', Parquet) SETTINGS use_cache_for_count_from_files=false;"
echo "bloom filter is off, all row groups should be read"
echo "expect rows_read = select count()"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or flba='WNMM' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false" | jq 'del(.meta,.statistics.elapsed)'
echo "bloom filter is on, some row groups should be skipped"
echo "expect rows_read much less than select count()"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or flba='WNMM' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false" | jq 'del(.meta,.statistics.elapsed)'
echo "bloom filter is on, but where predicate contains data from 2 row groups out of 3."
echo "Rows read should be less than select count, but greater than previous selects"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or string='ZHZK' order by uint16_logical asc Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "bloom filter is on, but where predicate contains data from all row groups"
echo "expect rows_read = select count()"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or string='ZHZK' or uint64_logical=18441251162536403933 order by uint16_logical asc Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "IN check"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string in ('PFJH', 'ZHZK') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "tuple in case, bf is off."
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in ('PFJH', 'GKJC') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "tuple in case, bf is on."
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in ('PFJH', 'GKJC') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "complex tuple in case, bf is off"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('PFJH', 'GKJC'), ('NON2', 'NON2')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "complex tuple in case, bf is on"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('PFJH', 'GKJC'), ('NON2', 'NON2')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "complex tuple in case, bf is on. Non existent"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('NON2', 'NON2'), ('NON3', 'NON3')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "Bloom filter for json column. BF is off"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "Bloom filter for json column. BF is on"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "Bloom filter for ipv4 column. BF is off"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "Bloom filter for ipv4 column. BF is on"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "Bloom filter for ipv4 column. BF is on. Specified in the schema"
${CLICKHOUSE_CLIENT} --query="select ipv4 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv4 IPv4') where ipv4 = toIPv4('0.0.1.143') order by ipv4 asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "Bloom filter on 64 bit column read as ipv4. We explicitly deny it, should read all rg"
${CLICKHOUSE_CLIENT} --query="select uint64_logical from file ('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical IPv4') where uint64_logical = toIPv4(5552715629697883300) order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "BF off for parquet uint64 logical type. Should read everything"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "BF on for parquet uint64 logical type. Uint64 is stored as a signed int 64, but with logical annotation. Make sure a value greater than int64 can be queried"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "Uint16 is stored as physical type int32 with bidwidth = 16 and sign = false. Make sure a value greater than int16 can be queried. BF is on."
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint16_logical=65528 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "BF off for parquet int8 logical type. Should read everything"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "BF on for parquet int8 logical type. Should skip row groups"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "Invalid column conversion with in operation. String type can not be hashed against parquet int64 physical type. Should read everything"
${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical String') where uint64_logical in ('5') order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "Transformations on key column shall not be allowed. Should read everything"
${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) where negate(uint64_logical) = -7711695863945021976 order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
rm -rf ${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/*

View File

@ -0,0 +1,76 @@
bloom filter is off, row groups should be read
expect rows_read = select count()
{
"data": [
{
"ipv6": "7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995"
}
],
"rows": 1,
"statistics": {
"rows_read": 5,
"bytes_read": 128
}
}
bloom filter is on for ipv6, row groups should also be read since there is only one. Below queries just make sure the data is properly returned
{
"data": [
{
"ipv6": "7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995"
}
],
"rows": 1,
"statistics": {
"rows_read": 5,
"bytes_read": 128
}
}
{
"data": [
{
"ipv6": "7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995"
}
],
"rows": 1,
"statistics": {
"rows_read": 5,
"bytes_read": 128
}
}
{
"data": [
{
"toIPv6(ipv6)": "7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995"
}
],
"rows": 1,
"statistics": {
"rows_read": 5,
"bytes_read": 128
}
}
non existent ipv6, row group should be skipped
{
"data": [],
"rows": 0,
"statistics": {
"rows_read": 0,
"bytes_read": 0
}
}
{
"data": [],
"rows": 0,
"statistics": {
"rows_read": 0,
"bytes_read": 0
}
}
{
"data": [],
"rows": 0,
"statistics": {
"rows_read": 5,
"bytes_read": 128
}
}

View File

@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Tags: no-ubsan, no-fasttest
CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh
USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}"
mkdir -p "${WORKING_DIR}"
DATA_FILE="${CUR_DIR}/data_parquet/ipv6_bloom_filter.gz.parquet"
DATA_FILE_USER_PATH="${WORKING_DIR}/ipv6_bloom_filter.gz.parquet"
cp ${DATA_FILE} ${DATA_FILE_USER_PATH}
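# The sample file contains a single row group of 5 rows, so the reference expects
# rows_read = 5 whenever the group is read and 0 whenever it is skipped.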
echo "bloom filter is off, row groups should be read"
echo "expect rows_read = select count()"
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = '7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "bloom filter is on for ipv6, row groups should also be read since there is only one. Below queries just make sure the data is properly returned"
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = '7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = toIPv6('7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
${CLICKHOUSE_CLIENT} --query="select toIPv6(ipv6) from file('${DATA_FILE_USER_PATH}', Parquet) where ipv6 = toIPv6('7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
echo "non existent ipv6, row group should be skipped"
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = 'fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = toIPv6('fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
${CLICKHOUSE_CLIENT} --query="select toIPv6(ipv6) from file('${DATA_FILE_USER_PATH}', Parquet) where ipv6 = toIPv6('fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

View File

@ -0,0 +1,102 @@
Arrow
9260153077572524277
ArrowStream
9260153077572524277
Avro
9260153077572524277
BSONEachRow
9260153077572524277
CSV
9260153077572524277
CSVWithNames
9260153077572524277
CSVWithNamesAndTypes
9260153077572524277
CapnProto
9260153077572524277
CustomSeparated
9260153077572524277
CustomSeparatedWithNames
9260153077572524277
CustomSeparatedWithNamesAndTypes
9260153077572524277
JSON
9260153077572524277
JSONColumns
9260153077572524277
JSONColumnsWithMetadata
9260153077572524277
JSONCompact
9260153077572524277
JSONCompactColumns
9260153077572524277
JSONCompactEachRow
9260153077572524277
JSONCompactEachRowWithNames
9260153077572524277
JSONCompactEachRowWithNamesAndTypes
9260153077572524277
JSONCompactStringsEachRow
9260153077572524277
JSONCompactStringsEachRowWithNames
9260153077572524277
JSONCompactStringsEachRowWithNamesAndTypes
9260153077572524277
JSONEachRow
9260153077572524277
JSONLines
9260153077572524277
JSONObjectEachRow
9260153077572524277
JSONStringsEachRow
9260153077572524277
MsgPack
9260153077572524277
NDJSON
9260153077572524277
Native
9260153077572524277
ORC
9260153077572524277
Parquet
9260153077572524277
Raw
9260153077572524277
RawWithNames
9260153077572524277
RawWithNamesAndTypes
9260153077572524277
RowBinary
9260153077572524277
RowBinaryWithNames
9260153077572524277
RowBinaryWithNamesAndTypes
9260153077572524277
TSKV
9260153077572524277
TSV
9260153077572524277
TSVRaw
9260153077572524277
TSVRawWithNames
9260153077572524277
TSVRawWithNamesAndTypes
9260153077572524277
TSVWithNames
9260153077572524277
TSVWithNamesAndTypes
9260153077572524277
TabSeparated
9260153077572524277
TabSeparatedRaw
9260153077572524277
TabSeparatedRawWithNames
9260153077572524277
TabSeparatedRawWithNamesAndTypes
9260153077572524277
TabSeparatedWithNames
9260153077572524277
TabSeparatedWithNamesAndTypes
9260153077572524277
Values
9260153077572524277

View File

@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Tags: no-fasttest, long
set -e
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
formats=$($CLICKHOUSE_CLIENT --query "
SELECT name FROM system.formats
WHERE is_input AND is_output AND name NOT IN ('Template', 'Npy', 'RawBLOB', 'ProtobufList', 'ProtobufSingle', 'Protobuf', 'LineAsString')
ORDER BY name FORMAT TSV
")
$CLICKHOUSE_CLIENT --query "
DROP TABLE IF EXISTS t_sparse_all_formats;
CREATE TABLE t_sparse_all_formats (a UInt64, b UInt64, c String) ENGINE = MergeTree ORDER BY a;
"
for format in $formats; do
echo $format
$CLICKHOUSE_CLIENT --query "INSERT INTO t_sparse_all_formats(a) SELECT number FROM numbers(1000)"
$CLICKHOUSE_CLIENT --query "SELECT number AS a, 0::UInt64 AS b, '' AS c FROM numbers(1000) FORMAT $format" \
| ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+t_sparse_all_formats+FORMAT+$format&enable_parsing_to_custom_serialization=1" --data-binary @-
$CLICKHOUSE_CLIENT --query "SELECT number AS a FROM numbers(1000) FORMAT $format" \
| ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+t_sparse_all_formats(a)+FORMAT+$format&enable_parsing_to_custom_serialization=1" --data-binary @-
$CLICKHOUSE_CLIENT --query "
SELECT sum(sipHash64(*)) FROM t_sparse_all_formats;
TRUNCATE TABLE t_sparse_all_formats;
"
done