Merge remote-tracking branch 'ClickHouse/master' into query-time-ef-search
Commit: 4b6b152562
@@ -1,4 +1,21 @@
set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_DEBUG=0") # More checks in debug build.
if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG")
    # Enable libcxx debug mode: https://releases.llvm.org/15.0.0/projects/libcxx/docs/DesignDocs/DebugMode.html
    # The docs say the debug mode violates complexity guarantees, so do this only for Debug builds.
    # set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_ENABLE_DEBUG_MODE=1")
    # ^^ Crashes the database upon startup, needs investigation.
    # Besides that, the implementation looks like a poor man's MSAN specific to libcxx. Since CI tests MSAN
    # anyways, we can keep the debug mode disabled.

    # Libcxx also provides extra assertions:
    # --> https://releases.llvm.org/15.0.0/projects/libcxx/docs/UsingLibcxx.html#assertions-mode
    # These look orthogonal to the debug mode but the debug mode enables them implicitly:
    # --> https://github.com/llvm/llvm-project/blob/release/15.x/libcxx/include/__assert#L29
    # They are cheap and straightforward, so enable them in debug builds:
    set (CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -D_LIBCPP_ENABLE_ASSERTIONS=1")

    # TODO Once we upgrade to LLVM 18+, reconsider all of the above as they introduced "hardening modes":
    # https://libcxx.llvm.org/Hardening.html
endif ()

add_subdirectory(contrib/libcxxabi-cmake)
add_subdirectory(contrib/libcxx-cmake)

@@ -1,6 +1,9 @@
set(ABSL_ROOT_DIR "${ClickHouse_SOURCE_DIR}/contrib/abseil-cpp")
set(ABSL_COMMON_INCLUDE_DIRS "${ABSL_ROOT_DIR}")

# To avoid errors "'X' does not refer to a value" while using `offsetof` function.
set(CMAKE_CXX_STANDARD 17)

# This is a minimized version of the function definition in CMake/AbseilHelpers.cmake

#

@@ -5,6 +5,9 @@ if(NOT ENABLE_PROTOBUF)
    return()
endif()

# To avoid errors "'X' does not refer to a value" while using `offsetof` function.
set(CMAKE_CXX_STANDARD 17)

set(Protobuf_INCLUDE_DIR "${ClickHouse_SOURCE_DIR}/contrib/google-protobuf/src")
if(OS_FREEBSD AND SANITIZE STREQUAL "address")
    # ../contrib/protobuf/src/google/protobuf/arena_impl.h:45:10: fatal error: 'sanitizer/asan_interface.h' file not found

@@ -6,6 +6,8 @@ if(NOT ENABLE_GRPC)
    return()
endif()

set(CMAKE_CXX_STANDARD 17)

set(_gRPC_SOURCE_DIR "${ClickHouse_SOURCE_DIR}/contrib/grpc")
set(_gRPC_BINARY_DIR "${ClickHouse_BINARY_DIR}/contrib/grpc")

@@ -22,7 +22,7 @@
# limitations under the License.

# We want to use C++23, but GRPC is not ready
set (CMAKE_CXX_STANDARD 20)
set (CMAKE_CXX_STANDARD 17)

set(_gRPC_ZLIB_INCLUDE_DIR "")
set(_gRPC_ZLIB_LIBRARIES ch_contrib::zlib)

@@ -2933,7 +2933,42 @@ The same as ‘today() - 1’.

## timeSlot

Rounds the time to the half hour.
Rounds the time to the start of a half-an-hour length interval.

**Syntax**

```sql
timeSlot(time[, time_zone])
```

**Arguments**

- `time` — Time to round to the start of a half-an-hour length interval. [DateTime](../data-types/datetime.md)/[Date32](../data-types/date32.md)/[DateTime64](../data-types/datetime64.md).
- `time_zone` — A String type const value or an expression representing the time zone. [String](../data-types/string.md).

:::note
Though this function can take values of the extended types `Date32` and `DateTime64` as an argument, passing it a time outside the normal range (year 1970 to 2149 for `Date` / 2106 for `DateTime`) will produce wrong results.
:::

**Return type**

- Returns the time rounded to the start of a half-an-hour length interval. [DateTime](../data-types/datetime.md).

**Example**

Query:

```sql
SELECT timeSlot(toDateTime('2000-01-02 03:04:05', 'UTC'));
```

Result:

```response
┌─timeSlot(toDateTime('2000-01-02 03:04:05', 'UTC'))─┐
│ 2000-01-02 03:00:00 │
└────────────────────────────────────────────────────┘
```
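
The optional `time_zone` argument from the syntax above can also be passed explicitly. A minimal sketch (output omitted; `'UTC'` here is just an example value for `time_zone`):

```sql
SELECT timeSlot(toDateTime('2000-01-02 03:04:05', 'UTC'), 'UTC');
```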

## toYYYYMM

@@ -31,7 +31,7 @@ std::string RemoteProxyHostFetcherImpl::fetch(const Poco::URI & endpoint, const
        endpoint.toString(),
        response.getStatus(),
        response.getReason(),
        "");
        /* body_length = */ 0);

    std::string proxy_host;
    Poco::StreamCopier::copyToString(response_body_stream, proxy_host);

@@ -25,15 +25,11 @@ namespace
 * `curl` strips leading dot and accepts url gitlab.com as a match for no_proxy .gitlab.com,
 * while `wget` does an exact match.
 * */
std::string buildPocoRegexpEntryWithoutLeadingDot(const std::string & host)
std::string buildPocoRegexpEntryWithoutLeadingDot(std::string_view host)
{
    std::string_view view_without_leading_dot = host;
    if (host[0] == '.')
    {
        view_without_leading_dot = std::string_view {host.begin() + 1u, host.end()};
    }

    return RE2::QuoteMeta(view_without_leading_dot);
    if (host.starts_with('.'))
        host.remove_prefix(1);
    return RE2::QuoteMeta(host);
}

}

@@ -170,6 +170,9 @@ Avoid reordering rows when reading from Parquet files. Usually makes it much slo
)", 0) \
    M(Bool, input_format_parquet_filter_push_down, true, R"(
When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and min/max statistics in the Parquet metadata.
)", 0) \
    M(Bool, input_format_parquet_bloom_filter_push_down, false, R"(
When reading Parquet files, skip whole row groups based on the WHERE expressions and bloom filter in the Parquet metadata.
)", 0) \
    M(Bool, input_format_parquet_use_native_reader, false, R"(
When reading Parquet files, to use native reader instead of arrow reader.

@@ -102,6 +102,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
            {"allow_experimental_refreshable_materialized_view", false, true, "Not experimental anymore"},
            {"max_parts_to_move", 1000, 1000, "New setting"},
            {"hnsw_candidate_list_size_for_search", 0, 0, "New setting"},
            {"input_format_parquet_bloom_filter_push_down", false, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata."},
        }
    },
    {"24.9",

@@ -36,8 +36,8 @@ public:

    auto findByValue(const T & value) const
    {
        const auto it = value_to_name_map.find(value);
        if (it == std::end(value_to_name_map))
        auto it = value_to_name_map.find(value);
        if (it == value_to_name_map.end())
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected value {} in enum", toString(value));

        return it;
@@ -58,7 +58,7 @@ public:
    bool getNameForValue(const T & value, StringRef & result) const
    {
        const auto it = value_to_name_map.find(value);
        if (it == std::end(value_to_name_map))
        if (it == value_to_name_map.end())
            return false;

        result = it->second;

@ -321,6 +321,8 @@ bool isUInt8(TYPE data_type) { return WhichDataType(data_type).isUInt8(); } \
|
||||
bool isUInt16(TYPE data_type) { return WhichDataType(data_type).isUInt16(); } \
|
||||
bool isUInt32(TYPE data_type) { return WhichDataType(data_type).isUInt32(); } \
|
||||
bool isUInt64(TYPE data_type) { return WhichDataType(data_type).isUInt64(); } \
|
||||
bool isUInt128(TYPE data_type) { return WhichDataType(data_type).isUInt128(); } \
|
||||
bool isUInt256(TYPE data_type) { return WhichDataType(data_type).isUInt256(); } \
|
||||
bool isNativeUInt(TYPE data_type) { return WhichDataType(data_type).isNativeUInt(); } \
|
||||
bool isUInt(TYPE data_type) { return WhichDataType(data_type).isUInt(); } \
|
||||
\
|
||||
@ -328,6 +330,8 @@ bool isInt8(TYPE data_type) { return WhichDataType(data_type).isInt8(); } \
|
||||
bool isInt16(TYPE data_type) { return WhichDataType(data_type).isInt16(); } \
|
||||
bool isInt32(TYPE data_type) { return WhichDataType(data_type).isInt32(); } \
|
||||
bool isInt64(TYPE data_type) { return WhichDataType(data_type).isInt64(); } \
|
||||
bool isInt128(TYPE data_type) { return WhichDataType(data_type).isInt128(); } \
|
||||
bool isInt256(TYPE data_type) { return WhichDataType(data_type).isInt256(); } \
|
||||
bool isNativeInt(TYPE data_type) { return WhichDataType(data_type).isNativeInt(); } \
|
||||
bool isInt(TYPE data_type) { return WhichDataType(data_type).isInt(); } \
|
||||
\
|
||||
|
@ -457,7 +457,9 @@ struct WhichDataType
|
||||
bool isUInt8(TYPE data_type); \
|
||||
bool isUInt16(TYPE data_type); \
|
||||
bool isUInt32(TYPE data_type); \
|
||||
bool isUInt64(TYPE data_type); \
|
||||
bool isUInt64(TYPE data_type);\
|
||||
bool isUInt128(TYPE data_type);\
|
||||
bool isUInt256(TYPE data_type); \
|
||||
bool isNativeUInt(TYPE data_type); \
|
||||
bool isUInt(TYPE data_type); \
|
||||
\
|
||||
@ -465,6 +467,8 @@ bool isInt8(TYPE data_type); \
|
||||
bool isInt16(TYPE data_type); \
|
||||
bool isInt32(TYPE data_type); \
|
||||
bool isInt64(TYPE data_type); \
|
||||
bool isInt128(TYPE data_type); \
|
||||
bool isInt256(TYPE data_type); \
|
||||
bool isNativeInt(TYPE data_type); \
|
||||
bool isInt(TYPE data_type); \
|
||||
\
|
||||
|
@ -111,9 +111,9 @@ std::future<IAsynchronousReader::Result> ThreadPoolReader::submit(Request reques
|
||||
/// RWF_NOWAIT flag may return 0 even when not at end of file.
|
||||
/// It can't be distinguished from the real eof, so we have to
|
||||
/// disable pread with nowait.
|
||||
static std::atomic<bool> has_pread_nowait_support = !hasBugInPreadV2();
|
||||
static const bool has_pread_nowait_support = !hasBugInPreadV2();
|
||||
|
||||
if (has_pread_nowait_support.load(std::memory_order_relaxed))
|
||||
if (has_pread_nowait_support)
|
||||
{
|
||||
/// It reports real time spent including the time spent while thread was preempted doing nothing.
|
||||
/// And it is Ok for the purpose of this watch (it is used to lower the number of threads to read from tables).
|
||||
@ -161,7 +161,8 @@ std::future<IAsynchronousReader::Result> ThreadPoolReader::submit(Request reques
|
||||
if (errno == ENOSYS || errno == EOPNOTSUPP)
|
||||
{
|
||||
/// No support for the syscall or the flag in the Linux kernel.
|
||||
has_pread_nowait_support.store(false, std::memory_order_relaxed);
|
||||
/// It shouldn't happen because we check the kernel version but let's
|
||||
/// fallback to the thread pool.
|
||||
break;
|
||||
}
|
||||
if (errno == EAGAIN)
|
||||
|
@ -92,12 +92,26 @@ std::unique_ptr<S3::Client> getClient(
|
||||
"Region should be explicitly specified for directory buckets");
|
||||
}
|
||||
|
||||
const Settings & local_settings = context->getSettingsRef();
|
||||
|
||||
int s3_max_redirects = static_cast<int>(global_settings[Setting::s3_max_redirects]);
|
||||
if (!for_disk_s3 && local_settings.isChanged("s3_max_redirects"))
|
||||
s3_max_redirects = static_cast<int>(local_settings[Setting::s3_max_redirects]);
|
||||
|
||||
int s3_retry_attempts = static_cast<int>(global_settings[Setting::s3_retry_attempts]);
|
||||
if (!for_disk_s3 && local_settings.isChanged("s3_retry_attempts"))
|
||||
s3_retry_attempts = static_cast<int>(local_settings[Setting::s3_retry_attempts]);
|
||||
|
||||
bool enable_s3_requests_logging = global_settings[Setting::enable_s3_requests_logging];
|
||||
if (!for_disk_s3 && local_settings.isChanged("enable_s3_requests_logging"))
|
||||
enable_s3_requests_logging = local_settings[Setting::enable_s3_requests_logging];
|
||||
|
||||
S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration(
|
||||
auth_settings.region,
|
||||
context->getRemoteHostFilter(),
|
||||
static_cast<int>(global_settings[Setting::s3_max_redirects]),
|
||||
static_cast<int>(global_settings[Setting::s3_retry_attempts]),
|
||||
global_settings[Setting::enable_s3_requests_logging],
|
||||
s3_max_redirects,
|
||||
s3_retry_attempts,
|
||||
enable_s3_requests_logging,
|
||||
for_disk_s3,
|
||||
request_settings.get_request_throttler,
|
||||
request_settings.put_request_throttler,
|
||||
|
@ -191,6 +191,7 @@ FormatSettings getFormatSettings(const ContextPtr & context, const Settings & se
|
||||
format_settings.parquet.case_insensitive_column_matching = settings[Setting::input_format_parquet_case_insensitive_column_matching];
|
||||
format_settings.parquet.preserve_order = settings[Setting::input_format_parquet_preserve_order];
|
||||
format_settings.parquet.filter_push_down = settings[Setting::input_format_parquet_filter_push_down];
|
||||
format_settings.parquet.bloom_filter_push_down = settings[Setting::input_format_parquet_bloom_filter_push_down];
|
||||
format_settings.parquet.use_native_reader = settings[Setting::input_format_parquet_use_native_reader];
|
||||
format_settings.parquet.allow_missing_columns = settings[Setting::input_format_parquet_allow_missing_columns];
|
||||
format_settings.parquet.skip_columns_with_unsupported_types_in_schema_inference = settings[Setting::input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference];
|
||||
|
@ -274,6 +274,7 @@ struct FormatSettings
|
||||
bool skip_columns_with_unsupported_types_in_schema_inference = false;
|
||||
bool case_insensitive_column_matching = false;
|
||||
bool filter_push_down = true;
|
||||
bool bloom_filter_push_down = true;
|
||||
bool use_native_reader = false;
|
||||
std::unordered_set<int> skip_row_groups = {};
|
||||
bool output_string_as_string = false;
|
||||
|
@ -84,11 +84,9 @@ void assertResponseIsOk(const String & uri, Poco::Net::HTTPResponse & response,
|
||||
? ErrorCodes::RECEIVED_ERROR_TOO_MANY_REQUESTS
|
||||
: ErrorCodes::RECEIVED_ERROR_FROM_REMOTE_IO_SERVER;
|
||||
|
||||
std::stringstream body; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
|
||||
body.exceptions(std::ios::failbit);
|
||||
body << istr.rdbuf();
|
||||
|
||||
throw HTTPException(code, uri, status, response.getReason(), body.str());
|
||||
istr.seekg(0, std::ios::end);
|
||||
size_t body_length = istr.tellg();
|
||||
throw HTTPException(code, uri, status, response.getReason(), body_length);
|
||||
}
|
||||
}
|
||||
|
||||
@ -97,13 +95,13 @@ Exception HTTPException::makeExceptionMessage(
|
||||
const std::string & uri,
|
||||
Poco::Net::HTTPResponse::HTTPStatus http_status,
|
||||
const std::string & reason,
|
||||
const std::string & body)
|
||||
size_t body_length)
|
||||
{
|
||||
return Exception(code,
|
||||
"Received error from remote server {}. "
|
||||
"HTTP status code: {} {}, "
|
||||
"body: {}",
|
||||
uri, static_cast<int>(http_status), reason, body);
|
||||
"HTTP status code: {} '{}', "
|
||||
"body length: {} bytes",
|
||||
uri, static_cast<int>(http_status), reason, body_length);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -27,9 +27,9 @@ public:
|
||||
const std::string & uri,
|
||||
Poco::Net::HTTPResponse::HTTPStatus http_status_,
|
||||
const std::string & reason,
|
||||
const std::string & body
|
||||
size_t body_length = 0
|
||||
)
|
||||
: Exception(makeExceptionMessage(code, uri, http_status_, reason, body))
|
||||
: Exception(makeExceptionMessage(code, uri, http_status_, reason, body_length))
|
||||
, http_status(http_status_)
|
||||
{}
|
||||
|
||||
@ -46,7 +46,7 @@ private:
|
||||
const std::string & uri,
|
||||
Poco::Net::HTTPResponse::HTTPStatus http_status,
|
||||
const std::string & reason,
|
||||
const std::string & body);
|
||||
size_t body_length);
|
||||
|
||||
const char * name() const noexcept override { return "DB::HTTPException"; }
|
||||
const char * className() const noexcept override { return "DB::HTTPException"; }
|
||||
|
@ -423,8 +423,7 @@ std::unique_ptr<ReadBuffer> ReadWriteBufferFromHTTP::initialize()
|
||||
ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE,
|
||||
current_uri.toString(),
|
||||
Poco::Net::HTTPResponse::HTTP_REQUESTED_RANGE_NOT_SATISFIABLE,
|
||||
reason,
|
||||
"");
|
||||
reason);
|
||||
}
|
||||
throw Exception(
|
||||
ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE,
|
||||
@ -549,8 +548,7 @@ size_t ReadWriteBufferFromHTTP::readBigAt(char * to, size_t n, size_t offset, co
|
||||
ErrorCodes::HTTP_RANGE_NOT_SATISFIABLE,
|
||||
current_uri.toString(),
|
||||
Poco::Net::HTTPResponse::HTTP_REQUESTED_RANGE_NOT_SATISFIABLE,
|
||||
reason,
|
||||
"");
|
||||
reason);
|
||||
}
|
||||
|
||||
copyFromIStreamWithProgressCallback(*result.response_stream, to, n, progress_callback, &bytes_copied, &is_canceled);
|
||||
|
@ -238,6 +238,8 @@ public:
|
||||
|
||||
const Columns & getOrderedSet() const { return ordered_set; }
|
||||
|
||||
const std::vector<KeyTuplePositionMapping> & getIndexesMapping() const { return indexes_mapping; }
|
||||
|
||||
private:
|
||||
// If all arguments in tuple are key columns, we can optimize NOT IN when there is only one element.
|
||||
bool has_all_keys;
|
||||
|
@ -274,7 +274,8 @@ size_t IRowInputFormat::countRows(size_t)
|
||||
|
||||
void IRowInputFormat::setSerializationHints(const SerializationInfoByName & hints)
|
||||
{
|
||||
serializations = getPort().getHeader().getSerializations(hints);
|
||||
if (supportsCustomSerializations())
|
||||
serializations = getPort().getHeader().getSerializations(hints);
|
||||
}
|
||||
|
||||
|
||||
|
@ -59,6 +59,7 @@ protected:
|
||||
/// `max_block_size` can be ignored.
|
||||
virtual size_t countRows(size_t max_block_size);
|
||||
virtual bool supportsCountRows() const { return false; }
|
||||
virtual bool supportsCustomSerializations() const { return false; }
|
||||
|
||||
virtual void readPrefix() {} /// delimiter before begin of result
|
||||
virtual void readSuffix() {} /// delimiter after end of result
|
||||
|
@ -15,6 +15,7 @@
|
||||
#include <arrow/type_fwd.h>
|
||||
#include <boost/algorithm/string/case_conv.hpp>
|
||||
#include <Common/Exception.h>
|
||||
#include <parquet/metadata.h>
|
||||
|
||||
|
||||
namespace arrow
|
||||
@ -65,11 +66,22 @@ public:
|
||||
return result;
|
||||
}
|
||||
|
||||
// For a parquet schema {x: {i: int, j: int}}, this should be populated as follows
|
||||
// clickhouse_index = 0, parquet_indexes = {0, 1}
|
||||
struct ClickHouseIndexToParquetIndex
|
||||
{
|
||||
std::size_t clickhouse_index;
|
||||
std::vector<int> parquet_indexes;
|
||||
};
|
||||
|
||||
/// Only collect the required fields' indices. Eg. when just read a field of a struct,
|
||||
/// don't need to collect the whole indices in this struct.
|
||||
std::vector<int> findRequiredIndices(const Block & header, const arrow::Schema & schema)
|
||||
std::vector<ClickHouseIndexToParquetIndex> findRequiredIndices(
|
||||
const Block & header,
|
||||
const arrow::Schema & schema,
|
||||
const parquet::FileMetaData & file)
|
||||
{
|
||||
std::vector<int> required_indices;
|
||||
std::vector<ClickHouseIndexToParquetIndex> required_indices;
|
||||
std::unordered_set<int> added_indices;
|
||||
/// Flat all named fields' index information into a map.
|
||||
auto fields_indices = calculateFieldIndices(schema);
|
||||
@ -79,7 +91,7 @@ public:
|
||||
std::string col_name = named_col.name;
|
||||
if (ignore_case)
|
||||
boost::to_lower(col_name);
|
||||
findRequiredIndices(col_name, named_col.type, fields_indices, added_indices, required_indices);
|
||||
findRequiredIndices(col_name, i, named_col.type, fields_indices, added_indices, required_indices, file);
|
||||
}
|
||||
return required_indices;
|
||||
}
|
||||
@ -169,10 +181,12 @@ private:
|
||||
|
||||
void findRequiredIndices(
|
||||
const String & name,
|
||||
std::size_t header_index,
|
||||
DataTypePtr data_type,
|
||||
const std::unordered_map<std::string, std::pair<int, int>> & field_indices,
|
||||
std::unordered_set<int> & added_indices,
|
||||
std::vector<int> & required_indices)
|
||||
std::vector<ClickHouseIndexToParquetIndex> & required_indices,
|
||||
const parquet::FileMetaData & file)
|
||||
{
|
||||
auto nested_type = removeNullable(data_type);
|
||||
if (const DB::DataTypeTuple * type_tuple = typeid_cast<const DB::DataTypeTuple *>(nested_type.get()))
|
||||
@ -187,20 +201,20 @@ private:
|
||||
if (ignore_case)
|
||||
boost::to_lower(field_name);
|
||||
const auto & field_type = field_types[i];
|
||||
findRequiredIndices(Nested::concatenateName(name, field_name), field_type, field_indices, added_indices, required_indices);
|
||||
findRequiredIndices(Nested::concatenateName(name, field_name), header_index, field_type, field_indices, added_indices, required_indices, file);
|
||||
}
|
||||
return;
|
||||
}
|
||||
}
|
||||
else if (const auto * type_array = typeid_cast<const DB::DataTypeArray *>(nested_type.get()))
|
||||
{
|
||||
findRequiredIndices(name, type_array->getNestedType(), field_indices, added_indices, required_indices);
|
||||
findRequiredIndices(name, header_index, type_array->getNestedType(), field_indices, added_indices, required_indices, file);
|
||||
return;
|
||||
}
|
||||
else if (const auto * type_map = typeid_cast<const DB::DataTypeMap *>(nested_type.get()))
|
||||
{
|
||||
findRequiredIndices(name, type_map->getKeyType(), field_indices, added_indices, required_indices);
|
||||
findRequiredIndices(name, type_map->getValueType(), field_indices, added_indices, required_indices);
|
||||
findRequiredIndices(name, header_index, type_map->getKeyType(), field_indices, added_indices, required_indices, file);
|
||||
findRequiredIndices(name, header_index, type_map->getValueType(), field_indices, added_indices, required_indices, file);
|
||||
return;
|
||||
}
|
||||
auto it = field_indices.find(name);
|
||||
@ -211,14 +225,18 @@ private:
|
||||
}
|
||||
else
|
||||
{
|
||||
ClickHouseIndexToParquetIndex index_mapping;
|
||||
index_mapping.clickhouse_index = header_index;
|
||||
for (int j = 0; j < it->second.second; ++j)
|
||||
{
|
||||
auto index = it->second.first + j;
|
||||
if (added_indices.insert(index).second)
|
||||
{
|
||||
required_indices.emplace_back(index);
|
||||
index_mapping.parquet_indexes.emplace_back(index);
|
||||
}
|
||||
}
|
||||
|
||||
required_indices.emplace_back(index_mapping);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
@ -43,6 +43,7 @@ private:
|
||||
|
||||
size_t countRows(size_t max_block_size) override;
|
||||
bool supportsCountRows() const override { return true; }
|
||||
bool supportsCustomSerializations() const override { return true; }
|
||||
|
||||
const String & columnName(size_t i) const;
|
||||
size_t columnIndex(StringRef name, size_t key_index);
|
||||
|
@ -0,0 +1,525 @@
|
||||
#include <Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h>
|
||||
#include <iostream>
|
||||
|
||||
#if USE_PARQUET
|
||||
|
||||
#include <parquet/bloom_filter.h>
|
||||
#include <parquet/xxhasher.h>
|
||||
#include <Interpreters/convertFieldToType.h>
|
||||
#include <Columns/ColumnConst.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
bool isParquetStringTypeSupportedForBloomFilters(
|
||||
const std::shared_ptr<const parquet::LogicalType> & logical_type,
|
||||
parquet::ConvertedType::type converted_type)
|
||||
{
|
||||
if (logical_type &&
|
||||
!logical_type->is_none()
|
||||
&& !(logical_type->is_string() || logical_type->is_BSON() || logical_type->is_JSON()))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (parquet::ConvertedType::type::NONE != converted_type &&
|
||||
!(converted_type == parquet::ConvertedType::JSON || converted_type == parquet::ConvertedType::UTF8
|
||||
|| converted_type == parquet::ConvertedType::BSON))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool isParquetIntegerTypeSupportedForBloomFilters(const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
|
||||
{
|
||||
if (logical_type && !logical_type->is_none() && !logical_type->is_int())
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
if (parquet::ConvertedType::type::NONE != converted_type && !(converted_type == parquet::ConvertedType::INT_8 || converted_type == parquet::ConvertedType::INT_16
|
||||
|| converted_type == parquet::ConvertedType::INT_32 || converted_type == parquet::ConvertedType::INT_64
|
||||
|| converted_type == parquet::ConvertedType::UINT_8 || converted_type == parquet::ConvertedType::UINT_16
|
||||
|| converted_type == parquet::ConvertedType::UINT_32 || converted_type == parquet::ConvertedType::UINT_64))
|
||||
{
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
uint64_t hashSpecialFLBATypes(const Field & field)
|
||||
{
|
||||
const T & value = field.safeGet<T>();
|
||||
|
||||
parquet::FLBA flba(reinterpret_cast<const uint8_t*>(&value));
|
||||
|
||||
parquet::XxHasher hasher;
|
||||
|
||||
return hasher.Hash(&flba, sizeof(T));
|
||||
};
|
||||
|
||||
std::optional<uint64_t> tryHashStringWithoutCompatibilityCheck(const Field & field)
|
||||
{
|
||||
const auto field_type = field.getType();
|
||||
|
||||
if (field_type != Field::Types::Which::String)
|
||||
{
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
parquet::XxHasher hasher;
|
||||
parquet::ByteArray ba { field.safeGet<std::string>() };
|
||||
|
||||
return hasher.Hash(&ba);
|
||||
}
|
||||
|
||||
std::optional<uint64_t> tryHashString(
|
||||
const Field & field,
|
||||
const std::shared_ptr<const parquet::LogicalType> & logical_type,
|
||||
parquet::ConvertedType::type converted_type)
|
||||
{
|
||||
if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type))
|
||||
{
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
return tryHashStringWithoutCompatibilityCheck(field);
|
||||
}
|
||||
|
||||
std::optional<uint64_t> tryHashFLBA(
|
||||
const Field & field,
|
||||
const std::shared_ptr<const parquet::LogicalType> & logical_type,
|
||||
parquet::ConvertedType::type converted_type,
|
||||
std::size_t parquet_column_length)
|
||||
{
|
||||
if (!isParquetStringTypeSupportedForBloomFilters(logical_type, converted_type))
|
||||
{
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
const auto field_type = field.getType();
|
||||
|
||||
if (field_type == Field::Types::Which::IPv6 && parquet_column_length == sizeof(IPv6))
|
||||
{
|
||||
return hashSpecialFLBATypes<IPv6>(field);
|
||||
}
|
||||
|
||||
return tryHashStringWithoutCompatibilityCheck(field);
|
||||
}
|
||||
|
||||
template <typename ParquetPhysicalType>
|
||||
std::optional<uint64_t> tryHashInt(const Field & field, const std::shared_ptr<const parquet::LogicalType> & logical_type, parquet::ConvertedType::type converted_type)
|
||||
{
|
||||
if (!isParquetIntegerTypeSupportedForBloomFilters(logical_type, converted_type))
|
||||
{
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
parquet::XxHasher hasher;
|
||||
|
||||
if (field.getType() == Field::Types::Which::Int64)
|
||||
{
|
||||
return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<int64_t>()));
|
||||
}
|
||||
else if (field.getType() == Field::Types::Which::UInt64)
|
||||
{
|
||||
return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<uint64_t>()));
|
||||
}
|
||||
else if (field.getType() == Field::Types::IPv4)
|
||||
{
|
||||
/*
|
||||
* In theory, we could accept IPv4 over 64 bits variables. It would only be a problem in case it was hashed using the byte array api
|
||||
* with a zero-ed buffer that had a 32 bits variable copied into it.
|
||||
*
|
||||
* To be on the safe side, accept only in case physical type is 32 bits.
|
||||
* */
|
||||
if constexpr (std::is_same_v<int32_t, ParquetPhysicalType>)
|
||||
{
|
||||
return hasher.Hash(static_cast<ParquetPhysicalType>(field.safeGet<IPv4>()));
|
||||
}
|
||||
}
|
||||
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
std::optional<uint64_t> tryHash(const Field & field, const parquet::ColumnDescriptor * parquet_column_descriptor)
|
||||
{
|
||||
const auto physical_type = parquet_column_descriptor->physical_type();
|
||||
const auto & logical_type = parquet_column_descriptor->logical_type();
|
||||
const auto converted_type = parquet_column_descriptor->converted_type();
|
||||
|
||||
switch (physical_type)
|
||||
{
|
||||
case parquet::Type::type::INT32:
|
||||
return tryHashInt<int32_t>(field, logical_type, converted_type);
|
||||
case parquet::Type::type::INT64:
|
||||
return tryHashInt<int64_t>(field, logical_type, converted_type);
|
||||
case parquet::Type::type::BYTE_ARRAY:
|
||||
return tryHashString(field, logical_type, converted_type);
|
||||
case parquet::Type::type::FIXED_LEN_BYTE_ARRAY:
|
||||
return tryHashFLBA(field, logical_type, converted_type, parquet_column_descriptor->type_length());
|
||||
default:
|
||||
return std::nullopt;
|
||||
}
|
||||
}
|
||||
|
||||
std::optional<std::vector<uint64_t>> hash(const IColumn * data_column, const parquet::ColumnDescriptor * parquet_column_descriptor)
|
||||
{
|
||||
std::vector<uint64_t> hashes;
|
||||
|
||||
for (size_t i = 0u; i < data_column->size(); i++)
|
||||
{
|
||||
Field f;
|
||||
data_column->get(i, f);
|
||||
|
||||
auto hashed_value = tryHash(f, parquet_column_descriptor);
|
||||
|
||||
if (!hashed_value)
|
||||
{
|
||||
return std::nullopt;
|
||||
}
|
||||
|
||||
hashes.emplace_back(*hashed_value);
|
||||
}
|
||||
|
||||
return hashes;
|
||||
}
|
||||
|
||||
bool maybeTrueOnBloomFilter(const std::vector<uint64_t> & hashes, const std::unique_ptr<parquet::BloomFilter> & bloom_filter)
|
||||
{
|
||||
for (const auto hash : hashes)
|
||||
{
|
||||
if (bloom_filter->FindHash(hash))
|
||||
{
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
const parquet::ColumnDescriptor * getColumnDescriptorIfBloomFilterIsPresent(
|
||||
const std::unique_ptr<parquet::RowGroupMetaData> & parquet_rg_metadata,
|
||||
const std::vector<ArrowFieldIndexUtil::ClickHouseIndexToParquetIndex> & clickhouse_column_index_to_parquet_index,
|
||||
std::size_t clickhouse_column_index)
|
||||
{
|
||||
if (clickhouse_column_index_to_parquet_index.size() <= clickhouse_column_index)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
const auto & parquet_indexes = clickhouse_column_index_to_parquet_index[clickhouse_column_index].parquet_indexes;
|
||||
|
||||
// complex types like structs, tuples and maps will have more than one index.
|
||||
// we don't support those for now
|
||||
if (parquet_indexes.size() > 1)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (parquet_indexes.empty())
|
||||
{
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Something bad happened, raise an issue and try the query with `input_format_parquet_bloom_filter_push_down=false`");
|
||||
}
|
||||
|
||||
auto parquet_column_index = parquet_indexes[0];
|
||||
|
||||
const auto * parquet_column_descriptor = parquet_rg_metadata->schema()->Column(parquet_column_index);
|
||||
|
||||
bool column_has_bloom_filter = parquet_rg_metadata->ColumnChunk(parquet_column_index)->bloom_filter_offset().has_value();
|
||||
if (!column_has_bloom_filter)
|
||||
{
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return parquet_column_descriptor;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
ParquetBloomFilterCondition::ParquetBloomFilterCondition(const std::vector<ConditionElement> & condition_, const Block & header_)
|
||||
: condition(condition_), header(header_)
|
||||
{
|
||||
}
|
||||
|
||||
bool ParquetBloomFilterCondition::mayBeTrueOnRowGroup(const ColumnIndexToBF & column_index_to_column_bf) const
|
||||
{
|
||||
using Function = ConditionElement::Function;
|
||||
std::vector<BoolMask> rpn_stack;
|
||||
|
||||
for (const auto & element : condition)
|
||||
{
|
||||
if (element.function == Function::FUNCTION_IN
|
||||
|| element.function == Function::FUNCTION_NOT_IN)
|
||||
{
|
||||
bool maybe_true = true;
|
||||
for (auto column_index = 0u; column_index < element.hashes_per_column.size(); column_index++)
|
||||
{
|
||||
// in case bloom filter is not present for this row group
|
||||
// https://github.com/ClickHouse/ClickHouse/pull/62966#discussion_r1722361237
|
||||
if (!column_index_to_column_bf.contains(element.key_columns[column_index]))
|
||||
{
|
||||
rpn_stack.emplace_back(true, true);
|
||||
continue;
|
||||
}
|
||||
|
||||
bool column_maybe_contains = maybeTrueOnBloomFilter(
|
||||
element.hashes_per_column[column_index],
|
||||
column_index_to_column_bf.at(element.key_columns[column_index]));
|
||||
|
||||
if (!column_maybe_contains)
|
||||
{
|
||||
maybe_true = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
rpn_stack.emplace_back(maybe_true, true);
|
||||
if (element.function == Function::FUNCTION_NOT_IN)
|
||||
rpn_stack.back() = !rpn_stack.back();
|
||||
}
|
||||
else if (element.function == Function::FUNCTION_NOT)
|
||||
{
|
||||
rpn_stack.back() = !rpn_stack.back();
|
||||
}
|
||||
else if (element.function == Function::FUNCTION_OR)
|
||||
{
|
||||
auto arg1 = rpn_stack.back();
|
||||
rpn_stack.pop_back();
|
||||
auto arg2 = rpn_stack.back();
|
||||
rpn_stack.back() = arg1 | arg2;
|
||||
}
|
||||
else if (element.function == Function::FUNCTION_AND)
|
||||
{
|
||||
auto arg1 = rpn_stack.back();
|
||||
rpn_stack.pop_back();
|
||||
auto arg2 = rpn_stack.back();
|
||||
rpn_stack.back() = arg1 & arg2;
|
||||
}
|
||||
else if (element.function == Function::ALWAYS_TRUE)
|
||||
{
|
||||
rpn_stack.emplace_back(true, false);
|
||||
}
|
||||
else if (element.function == Function::ALWAYS_FALSE)
|
||||
{
|
||||
rpn_stack.emplace_back(false, true);
|
||||
}
|
||||
else
|
||||
{
|
||||
rpn_stack.emplace_back(true, true);
|
||||
}
|
||||
}
|
||||
|
||||
if (rpn_stack.size() != 1)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected stack size in KeyCondition::mayBeTrueOnRowGroup");
|
||||
|
||||
return rpn_stack[0].can_be_true;
|
||||
}
|
||||
|
||||
std::unordered_set<std::size_t> ParquetBloomFilterCondition::getFilteringColumnKeys() const
|
||||
{
|
||||
std::unordered_set<std::size_t> column_keys;
|
||||
|
||||
for (const auto & element : condition)
|
||||
{
|
||||
for (const auto index : element.key_columns)
|
||||
{
|
||||
column_keys.insert(index);
|
||||
}
|
||||
}
|
||||
|
||||
return column_keys;
|
||||
}
|
||||
|
||||
/*
|
||||
* `KeyCondition::rpn` is overly complex for bloom filters, some operations are not even supported. Not only that, but to avoid hashing each time
|
||||
* we loop over a rpn element, we need to store hashes instead of where predicate values. To address this, we loop over `KeyCondition::rpn`
|
||||
* and build a simplified RPN that holds hashes instead of values.
|
||||
*
|
||||
* `KeyCondition::RPNElement::FUNCTION_IN_RANGE` becomes:
|
||||
* `FUNCTION_IN`
|
||||
* `FUNCTION_UNKNOWN` when range limits are different
|
||||
* `KeyCondition::RPNElement::FUNCTION_IN_SET` becomes
|
||||
* `FUNCTION_IN`
|
||||
*
|
||||
* Complex types and structs are not supported.
|
||||
* There are two sources of data types being analyzed, and they need to be compatible: DB::Field type and parquet type.
|
||||
* This is determined by the `isColumnSupported` method.
|
||||
*
|
||||
* Some interesting examples:
|
||||
* 1. file(..., 'str_column UInt64') where str_column = 50; Field.type == UInt64. Parquet type string. Not supported.
|
||||
* 2. file(...) where str_column = 50; Field.type == String (conversion already taken care by `KeyCondition`). Parquet type string.
|
||||
* 3. file(...) where uint32_column = toIPv4(5). Field.type == IPv4. Incompatible column types, resolved by `KeyCondition` itself.
|
||||
* 4. file(...) where toIPv4(uint32_column) = toIPv4(5). Field.type == IPv4. We know it is safe to hash it using an int32 API.
|
||||
* */
|
||||
std::vector<ParquetBloomFilterCondition::ConditionElement> keyConditionRPNToParquetBloomFilterCondition(
|
||||
const std::vector<KeyCondition::RPNElement> & rpn,
|
||||
const std::vector<ArrowFieldIndexUtil::ClickHouseIndexToParquetIndex> & clickhouse_column_index_to_parquet_index,
|
||||
const std::unique_ptr<parquet::RowGroupMetaData> & parquet_rg_metadata)
|
||||
{
|
||||
std::vector<ParquetBloomFilterCondition::ConditionElement> condition_elements;
|
||||
|
||||
using RPNElement = KeyCondition::RPNElement;
|
||||
using Function = ParquetBloomFilterCondition::ConditionElement::Function;
|
||||
|
||||
for (const auto & rpn_element : rpn)
|
||||
{
|
||||
// this would be a problem for `where negate(x) = -58`.
|
||||
// It would perform a bf search on `-58`, and possibly miss row groups containing this data.
|
||||
if (!rpn_element.monotonic_functions_chain.empty())
|
||||
{
|
||||
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
|
||||
continue;
|
||||
}
|
||||
|
||||
ParquetBloomFilterCondition::ConditionElement::HashesForColumns hashes;
|
||||
|
||||
if (rpn_element.function == RPNElement::FUNCTION_IN_RANGE
|
||||
|| rpn_element.function == RPNElement::FUNCTION_NOT_IN_RANGE)
|
||||
{
|
||||
// Only FUNCTION_EQUALS is supported and for that extremes need to be the same
|
||||
if (rpn_element.range.left != rpn_element.range.right)
|
||||
{
|
||||
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto * parquet_column_descriptor =
|
||||
getColumnDescriptorIfBloomFilterIsPresent(parquet_rg_metadata, clickhouse_column_index_to_parquet_index, rpn_element.key_column);
|
||||
|
||||
if (!parquet_column_descriptor)
|
||||
{
|
||||
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto hashed_value = tryHash(rpn_element.range.left, parquet_column_descriptor);
|
||||
|
||||
if (!hashed_value)
|
||||
{
|
||||
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
|
||||
continue;
|
||||
}
|
||||
|
||||
std::vector<uint64_t> hashes_for_column;
|
||||
hashes_for_column.emplace_back(*hashed_value);
|
||||
|
||||
hashes.emplace_back(std::move(hashes_for_column));
|
||||
|
||||
auto function = rpn_element.function == RPNElement::FUNCTION_IN_RANGE
|
||||
? ParquetBloomFilterCondition::ConditionElement::Function::FUNCTION_IN
|
||||
: ParquetBloomFilterCondition::ConditionElement::Function::FUNCTION_NOT_IN;
|
||||
|
||||
std::vector<std::size_t> key_columns;
|
||||
key_columns.emplace_back(rpn_element.key_column);
|
||||
|
||||
condition_elements.emplace_back(function, std::move(hashes), std::move(key_columns));
|
||||
}
|
||||
else if (rpn_element.function == RPNElement::FUNCTION_IN_SET
|
||||
|| rpn_element.function == RPNElement::FUNCTION_NOT_IN_SET)
|
||||
{
|
||||
const auto & set_index = rpn_element.set_index;
|
||||
const auto & ordered_set = set_index->getOrderedSet();
|
||||
const auto & indexes_mapping = set_index->getIndexesMapping();
|
||||
bool found_empty_column = false;
|
||||
|
||||
std::vector<std::size_t> key_columns;
|
||||
|
||||
for (auto i = 0u; i < ordered_set.size(); i++)
|
||||
{
|
||||
const auto & set_column = ordered_set[i];
|
||||
|
||||
const auto * parquet_column_descriptor = getColumnDescriptorIfBloomFilterIsPresent(
|
||||
parquet_rg_metadata,
|
||||
clickhouse_column_index_to_parquet_index,
|
||||
indexes_mapping[i].key_index);
|
||||
|
||||
if (!parquet_column_descriptor)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
auto column = set_column;
|
||||
|
||||
if (column->empty())
|
||||
{
|
||||
found_empty_column = true;
|
||||
break;
|
||||
}
|
||||
|
||||
if (const auto & nullable_column = checkAndGetColumn<ColumnNullable>(set_column.get()))
|
||||
{
|
||||
column = nullable_column->getNestedColumnPtr();
|
||||
}
|
||||
|
||||
auto hashes_for_column_opt = hash(column.get(), parquet_column_descriptor);
|
||||
|
||||
if (!hashes_for_column_opt)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
auto & hashes_for_column = *hashes_for_column_opt;
|
||||
|
||||
if (hashes_for_column.empty())
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
hashes.emplace_back(hashes_for_column);
|
||||
|
||||
key_columns.push_back(indexes_mapping[i].key_index);
|
||||
}
|
||||
|
||||
if (found_empty_column)
|
||||
{
|
||||
condition_elements.emplace_back(Function::ALWAYS_FALSE);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (hashes.empty())
|
||||
{
|
||||
condition_elements.emplace_back(Function::FUNCTION_UNKNOWN);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto function = RPNElement::FUNCTION_IN_SET == rpn_element.function ? Function::FUNCTION_IN : Function::FUNCTION_NOT_IN;
|
||||
|
||||
condition_elements.emplace_back(function, hashes, key_columns);
|
||||
}
|
||||
else if (rpn_element.function == RPNElement::FUNCTION_NOT)
|
||||
{
|
||||
condition_elements.emplace_back(Function::FUNCTION_NOT);
|
||||
}
|
||||
else if (rpn_element.function == RPNElement::FUNCTION_OR)
|
||||
{
|
||||
condition_elements.emplace_back(Function::FUNCTION_OR);
|
||||
}
|
||||
else if (rpn_element.function == RPNElement::FUNCTION_AND)
|
||||
{
|
||||
condition_elements.emplace_back(Function::FUNCTION_AND);
|
||||
}
|
||||
else
|
||||
{
|
||||
condition_elements.emplace_back(Function::ALWAYS_TRUE);
|
||||
}
|
||||
}
|
||||
|
||||
return condition_elements;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@ -0,0 +1,73 @@
|
||||
#pragma once
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#if USE_PARQUET
|
||||
|
||||
#include <Storages/MergeTree/KeyCondition.h>
|
||||
#include <parquet/metadata.h>
|
||||
#include <Processors/Formats/Impl/ArrowFieldIndexUtil.h>
|
||||
|
||||
namespace parquet
|
||||
{
|
||||
class BloomFilter;
|
||||
}
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class ParquetBloomFilterCondition
|
||||
{
|
||||
public:
|
||||
|
||||
struct ConditionElement
|
||||
{
|
||||
enum Function
|
||||
{
|
||||
/// Atoms of a Boolean expression.
|
||||
FUNCTION_IN,
|
||||
FUNCTION_NOT_IN,
|
||||
/// Can take any value.
|
||||
FUNCTION_UNKNOWN,
|
||||
/// Operators of the logical expression.
|
||||
FUNCTION_NOT,
|
||||
FUNCTION_AND,
|
||||
FUNCTION_OR,
|
||||
/// Constants
|
||||
ALWAYS_FALSE,
|
||||
ALWAYS_TRUE,
|
||||
};
|
||||
|
||||
using ColumnPtr = IColumn::Ptr;
|
||||
using HashesForColumns = std::vector<std::vector<uint64_t>>;
|
||||
using KeyColumns = std::vector<std::size_t>;
|
||||
|
||||
Function function;
|
||||
// each entry represents a list of hashes per column
|
||||
// suppose there are three columns with 2 rows each
|
||||
// hashes_per_column.size() == 3 and hashes_per_column[0].size() == 2
|
||||
HashesForColumns hashes_per_column;
|
||||
KeyColumns key_columns;
|
||||
};
|
||||
|
||||
using RPNElement = KeyCondition::RPNElement;
|
||||
using ColumnIndexToBF = std::unordered_map<std::size_t, std::unique_ptr<parquet::BloomFilter>>;
|
||||
|
||||
explicit ParquetBloomFilterCondition(const std::vector<ConditionElement> & condition_, const Block & header_);
|
||||
|
||||
bool mayBeTrueOnRowGroup(const ColumnIndexToBF & column_index_to_column_bf) const;
|
||||
std::unordered_set<std::size_t> getFilteringColumnKeys() const;
|
||||
|
||||
private:
|
||||
std::vector<ParquetBloomFilterCondition::ConditionElement> condition;
|
||||
Block header;
|
||||
};
|
||||
|
||||
std::vector<ParquetBloomFilterCondition::ConditionElement> keyConditionRPNToParquetBloomFilterCondition(
|
||||
const std::vector<KeyCondition::RPNElement> & rpn,
|
||||
const std::vector<ArrowFieldIndexUtil::ClickHouseIndexToParquetIndex> & clickhouse_column_index_to_parquet_index,
|
||||
const std::unique_ptr<parquet::RowGroupMetaData> & parquet_rg_metadata);
|
||||
|
||||
}
|
||||
|
||||
#endif
|
@ -14,6 +14,8 @@
|
||||
#include <arrow/status.h>
|
||||
#include <parquet/arrow/reader.h>
|
||||
#include <parquet/arrow/schema.h>
|
||||
#include <parquet/bloom_filter.h>
|
||||
#include <parquet/bloom_filter_reader.h>
|
||||
#include <parquet/file_reader.h>
|
||||
#include <parquet/statistics.h>
|
||||
#include "ArrowBufferedStreams.h"
|
||||
@ -25,6 +27,7 @@
|
||||
#include <DataTypes/DataTypeNullable.h>
|
||||
#include <Common/FieldVisitorsAccurateComparison.h>
|
||||
#include <Processors/Formats/Impl/Parquet/ParquetRecordReader.h>
|
||||
#include <Processors/Formats/Impl/Parquet/ParquetBloomFilterCondition.h>
|
||||
#include <Interpreters/convertFieldToType.h>
|
||||
|
||||
namespace CurrentMetrics
|
||||
@ -263,6 +266,50 @@ static Field decodePlainParquetValueSlow(const std::string & data, parquet::Type
|
||||
return field;
|
||||
}
|
||||
|
||||
static ParquetBloomFilterCondition::ColumnIndexToBF buildColumnIndexToBF(
|
||||
parquet::BloomFilterReader & bf_reader,
|
||||
int row_group,
|
||||
const std::vector<ArrowFieldIndexUtil::ClickHouseIndexToParquetIndex> & clickhouse_column_index_to_parquet_index,
|
||||
const std::unordered_set<std::size_t> & filtering_columns
|
||||
)
|
||||
{
|
||||
auto rg_bf = bf_reader.RowGroup(row_group);
|
||||
|
||||
if (!rg_bf)
|
||||
{
|
||||
return {};
|
||||
}
|
||||
|
||||
ParquetBloomFilterCondition::ColumnIndexToBF index_to_column_bf;
|
||||
|
||||
for (const auto & [clickhouse_index, parquet_indexes] : clickhouse_column_index_to_parquet_index)
|
||||
{
|
||||
if (!filtering_columns.contains(clickhouse_index))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
// Complex / nested types contain more than one index. We don't support those.
|
||||
if (parquet_indexes.size() > 1)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
auto parquet_index = parquet_indexes[0];
|
||||
|
||||
auto bf = rg_bf->GetColumnBloomFilter(parquet_index);
|
||||
|
||||
if (!bf)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
index_to_column_bf[clickhouse_index] = std::move(bf);
|
||||
}
|
||||
|
||||
return index_to_column_bf;
|
||||
}
|
||||
|
||||
/// Range of values for each column, based on statistics in the Parquet metadata.
|
||||
/// This is lower/upper bounds, not necessarily exact min and max, e.g. the min/max can be just
|
||||
/// missing in the metadata.
|
||||
@ -474,9 +521,27 @@ void ParquetBlockInputFormat::initializeIfNeeded()
|
||||
ArrowFieldIndexUtil field_util(
|
||||
format_settings.parquet.case_insensitive_column_matching,
|
||||
format_settings.parquet.allow_missing_columns);
|
||||
column_indices = field_util.findRequiredIndices(getPort().getHeader(), *schema);
|
||||
|
||||
auto index_mapping = field_util.findRequiredIndices(getPort().getHeader(), *schema, *metadata);
|
||||
|
||||
for (const auto & [clickhouse_header_index, parquet_indexes] : index_mapping)
|
||||
{
|
||||
for (auto parquet_index : parquet_indexes)
|
||||
{
|
||||
column_indices.push_back(parquet_index);
|
||||
}
|
||||
}
|
||||
|
||||
int num_row_groups = metadata->num_row_groups();
|
||||
|
||||
if (num_row_groups == 0)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
const auto bf_reader_properties = parquet::default_reader_properties();
|
||||
std::unique_ptr<parquet::BloomFilterReader> bf_reader;
|
||||
|
||||
row_group_batches.reserve(num_row_groups);
|
||||
|
||||
auto adaptive_chunk_size = [&](int row_group_idx) -> size_t
|
||||
@ -497,11 +562,38 @@ void ParquetBlockInputFormat::initializeIfNeeded()
|
||||
return std::min(std::max(preferred_num_rows, MIN_ROW_NUM), static_cast<size_t>(format_settings.parquet.max_block_size));
|
||||
};
|
||||
|
||||
std::unique_ptr<ParquetBloomFilterCondition> parquet_bloom_filter_condition;
|
||||
|
||||
std::unordered_set<std::size_t> filtering_columns;
|
||||
|
||||
if (format_settings.parquet.bloom_filter_push_down && key_condition)
|
||||
{
|
||||
bf_reader = parquet::BloomFilterReader::Make(arrow_file, metadata, bf_reader_properties, nullptr);
|
||||
|
||||
const auto parquet_conditions = keyConditionRPNToParquetBloomFilterCondition(
|
||||
key_condition->getRPN(),
|
||||
index_mapping,
|
||||
metadata->RowGroup(0));
|
||||
parquet_bloom_filter_condition = std::make_unique<ParquetBloomFilterCondition>(parquet_conditions, getPort().getHeader());
|
||||
|
||||
filtering_columns = parquet_bloom_filter_condition->getFilteringColumnKeys();
|
||||
}
|
||||
|
||||
for (int row_group = 0; row_group < num_row_groups; ++row_group)
|
||||
{
|
||||
if (skip_row_groups.contains(row_group))
|
||||
continue;
|
||||
|
||||
if (parquet_bloom_filter_condition)
|
||||
{
|
||||
const auto column_index_to_bf = buildColumnIndexToBF(*bf_reader, row_group, index_mapping, filtering_columns);
|
||||
|
||||
if (!parquet_bloom_filter_condition->mayBeTrueOnRowGroup(column_index_to_bf))
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
if (format_settings.parquet.filter_push_down && key_condition
|
||||
&& !key_condition
|
||||
->checkInHyperrectangle(
|
||||
|
@ -38,6 +38,7 @@ private:
|
||||
|
||||
bool supportsCountRows() const override { return true; }
|
||||
size_t countRows(size_t max_block_size) override;
|
||||
bool supportsCustomSerializations() const override { return true; }
|
||||
|
||||
const FormatSettings format_settings;
|
||||
|
||||
|
@ -48,6 +48,7 @@ protected:
|
||||
bool isGarbageAfterField(size_t index, ReadBuffer::Position pos) override;
|
||||
void setReadBuffer(ReadBuffer & in_) override;
|
||||
void readPrefix() override;
|
||||
bool supportsCustomSerializations() const override { return true; }
|
||||
|
||||
const FormatSettings format_settings;
|
||||
DataTypes data_types;
|
||||
|
@ -817,7 +817,7 @@ void KeyCondition::getAllSpaceFillingCurves()
|
||||
KeyCondition::KeyCondition(
|
||||
const ActionsDAG * filter_dag,
|
||||
ContextPtr context,
|
||||
const Names & key_column_names,
|
||||
const Names & key_column_names_,
|
||||
const ExpressionActionsPtr & key_expr_,
|
||||
bool single_point_)
|
||||
: key_expr(key_expr_)
|
||||
@ -825,7 +825,7 @@ KeyCondition::KeyCondition(
|
||||
, single_point(single_point_)
|
||||
{
|
||||
size_t key_index = 0;
|
||||
for (const auto & name : key_column_names)
|
||||
for (const auto & name : key_column_names_)
|
||||
{
|
||||
if (!key_columns.contains(name))
|
||||
{
|
||||
|
@ -149,7 +149,6 @@ void traverse(const ParserImpl::Element & e, std::shared_ptr<JSONNode> node)
|
||||
|
||||
std::shared_ptr<JSONNode> parseJSON(const String & json)
|
||||
{
|
||||
std::string_view view{json.begin(), json.end()};
|
||||
ParserImpl::Element document;
|
||||
ParserImpl p;
|
||||
|
||||
|
@ -62,6 +62,7 @@ public:
|
||||
/// Avoid loading nested table by returning nullptr/false for all table functions.
|
||||
StoragePolicyPtr getStoragePolicy() const override { return nullptr; }
|
||||
bool storesDataOnDisk() const override { return false; }
|
||||
bool supportsReplication() const override { return false; }
|
||||
|
||||
void startup() override { }
|
||||
void shutdown(bool is_drop) override
|
||||
|
@ -0,0 +1,16 @@
|
||||
<clickhouse>
|
||||
<remote_servers>
|
||||
<cluster>
|
||||
<shard>
|
||||
<replica>
|
||||
<host>node1</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
<replica>
|
||||
<host>node2</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
</shard>
|
||||
</cluster>
|
||||
</remote_servers>
|
||||
</clickhouse>
|
@ -35,6 +35,7 @@ from pyspark.sql.window import Window
|
||||
|
||||
import helpers.client
|
||||
from helpers.cluster import ClickHouseCluster
|
||||
from helpers.network import PartitionManager
|
||||
from helpers.s3_tools import (
|
||||
get_file_contents,
|
||||
list_s3_objects,
|
||||
@ -74,10 +75,23 @@ def started_cluster():
|
||||
main_configs=[
|
||||
"configs/config.d/named_collections.xml",
|
||||
"configs/config.d/filesystem_caches.xml",
|
||||
"configs/config.d/remote_servers.xml",
|
||||
],
|
||||
user_configs=["configs/users.d/users.xml"],
|
||||
with_minio=True,
|
||||
stay_alive=True,
|
||||
with_zookeeper=True,
|
||||
)
|
||||
cluster.add_instance(
|
||||
"node2",
|
||||
main_configs=[
|
||||
"configs/config.d/named_collections.xml",
|
||||
"configs/config.d/remote_servers.xml",
|
||||
],
|
||||
user_configs=["configs/users.d/users.xml"],
|
||||
with_minio=True,
|
||||
stay_alive=True,
|
||||
with_zookeeper=True,
|
||||
)
|
||||
|
||||
logging.info("Starting cluster...")
|
||||
@ -891,3 +905,100 @@ def test_filesystem_cache(started_cluster, storage_type):
|
||||
f"SELECT ProfileEvents['S3GetObject'] FROM system.query_log WHERE query_id = '{query_id}' AND type = 'QueryFinish'"
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
def test_replicated_database_and_unavailable_s3(started_cluster):
|
||||
node1 = started_cluster.instances["node1"]
|
||||
node2 = started_cluster.instances["node2"]
|
||||
|
||||
DB_NAME = randomize_table_name("db")
|
||||
TABLE_NAME = randomize_table_name("test_replicated_database_and_unavailable_s3")
|
||||
minio_client = started_cluster.minio_client
|
||||
bucket = started_cluster.minio_restricted_bucket
|
||||
|
||||
if not minio_client.bucket_exists(bucket):
|
||||
minio_client.make_bucket(bucket)
|
||||
|
||||
node1.query(
|
||||
f"CREATE DATABASE {DB_NAME} ENGINE=Replicated('/clickhouse/databases/{DB_NAME}', 'shard1', 'node1')"
|
||||
)
|
||||
node2.query(
|
||||
f"CREATE DATABASE {DB_NAME} ENGINE=Replicated('/clickhouse/databases/{DB_NAME}', 'shard1', 'node2')"
|
||||
)
|
||||
|
||||
parquet_data_path = create_initial_data_file(
|
||||
started_cluster,
|
||||
node1,
|
||||
"SELECT number, toString(number) FROM numbers(100)",
|
||||
TABLE_NAME,
|
||||
)
|
||||
|
||||
endpoint_url = f"http://{started_cluster.minio_ip}:{started_cluster.minio_port}"
|
||||
aws_access_key_id = "minio"
|
||||
aws_secret_access_key = "minio123"
|
||||
|
||||
schema = pa.schema(
|
||||
[
|
||||
("id", pa.int32()),
|
||||
("name", pa.string()),
|
||||
]
|
||||
)
|
||||
|
||||
data = [
|
||||
pa.array([1, 2, 3], type=pa.int32()),
|
||||
pa.array(["John Doe", "Jane Smith", "Jake Johnson"], type=pa.string()),
|
||||
]
|
||||
storage_options = {
|
||||
"AWS_ENDPOINT_URL": endpoint_url,
|
||||
"AWS_ACCESS_KEY_ID": aws_access_key_id,
|
||||
"AWS_SECRET_ACCESS_KEY": aws_secret_access_key,
|
||||
"AWS_ALLOW_HTTP": "true",
|
||||
"AWS_S3_ALLOW_UNSAFE_RENAME": "true",
|
||||
}
|
||||
path = f"s3://root/{TABLE_NAME}"
|
||||
table = pa.Table.from_arrays(data, schema=schema)
|
||||
|
||||
write_deltalake(path, table, storage_options=storage_options)
|
||||
|
||||
with PartitionManager() as pm:
|
||||
pm_rule_reject = {
|
||||
"probability": 1,
|
||||
"destination": node2.ip_address,
|
||||
"source_port": started_cluster.minio_port,
|
||||
"action": "REJECT --reject-with tcp-reset",
|
||||
}
|
||||
pm_rule_drop_all = {
|
||||
"destination": node2.ip_address,
|
||||
"source_port": started_cluster.minio_port,
|
||||
"action": "DROP",
|
||||
}
|
||||
pm._add_rule(pm_rule_reject)
|
||||
|
||||
node1.query(
|
||||
f"""
|
||||
DROP TABLE IF EXISTS {DB_NAME}.{TABLE_NAME};
|
||||
CREATE TABLE {DB_NAME}.{TABLE_NAME}
|
||||
AS deltaLake('http://{started_cluster.minio_ip}:{started_cluster.minio_port}/root/{TABLE_NAME}' , 'minio', 'minio123')
|
||||
"""
|
||||
)
|
||||
|
||||
assert TABLE_NAME in node1.query(
|
||||
f"select name from system.tables where database = '{DB_NAME}'"
|
||||
)
|
||||
assert TABLE_NAME in node2.query(
|
||||
f"select name from system.tables where database = '{DB_NAME}'"
|
||||
)
|
||||
|
||||
replica_path = f"/clickhouse/databases/{DB_NAME}/replicas/shard1|node2"
|
||||
zk = started_cluster.get_kazoo_client("zoo1")
|
||||
zk.set(replica_path + "/digest", "123456".encode())
|
||||
|
||||
assert "123456" in node2.query(
|
||||
f"SELECT * FROM system.zookeeper WHERE path = '{replica_path}'"
|
||||
)
|
||||
|
||||
node2.restart_clickhouse()
|
||||
|
||||
assert "123456" not in node2.query(
|
||||
f"SELECT * FROM system.zookeeper WHERE path = '{replica_path}'"
|
||||
)
|
||||
|
@@ -0,0 +1,347 @@
1000
bloom filter is off, all row groups should be read
expect rows_read = select count()
{
  "data": [
    {
      "string": "AZSR",
      "flba": "WNMM"
    },
    {
      "string": "PFJH",
      "flba": "GKJC"
    }
  ],
  "rows": 2,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
bloom filter is on, some row groups should be skipped
expect rows_read much less than select count()
{
  "data": [
    {
      "string": "AZSR",
      "flba": "WNMM"
    },
    {
      "string": "PFJH",
      "flba": "GKJC"
    }
  ],
  "rows": 2,
  "statistics": {
    "rows_read": 464,
    "bytes_read": 21703
  }
}
bloom filter is on, but where predicate contains data from 2 row groups out of 3.
Rows read should be less than select count, but greater than previous selects
{
  "data": [
    {
      "string": "PFJH",
      "flba": "GKJC"
    },
    {
      "string": "ZHZK",
      "flba": "HRWD"
    }
  ],
  "rows": 2,
  "statistics": {
    "rows_read": 536,
    "bytes_read": 25708
  }
}
bloom filter is on, but where predicate contains data from all row groups
expect rows_read = select count()
{
  "data": [
    {
      "string": "PFJH",
      "flba": "GKJC"
    },
    {
      "string": "OKAI",
      "flba": "UXGT"
    },
    {
      "string": "ZHZK",
      "flba": "HRWD"
    }
  ],
  "rows": 3,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
IN check
{
  "data": [
    {
      "string": "PFJH",
      "flba": "GKJC"
    },
    {
      "string": "ZHZK",
      "flba": "HRWD"
    }
  ],
  "rows": 2,
  "statistics": {
    "rows_read": 536,
    "bytes_read": 25708
  }
}
tuple in case, bf is off.
{
  "data": [
    {
      "string": "PFJH",
      "flba": "GKJC"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
tuple in case, bf is on.
{
  "data": [
    {
      "string": "PFJH",
      "flba": "GKJC"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 464,
    "bytes_read": 21703
  }
}
complex tuple in case, bf is off
{
  "data": [
    {
      "string": "PFJH",
      "flba": "GKJC"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
complex tuple in case, bf is on
{
  "data": [
    {
      "string": "PFJH",
      "flba": "GKJC"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 464,
    "bytes_read": 21703
  }
}
complex tuple in case, bf is on. Non existent
{
  "data": [],
  "rows": 0,
  "statistics": {
    "rows_read": 0,
    "bytes_read": 0
  }
}
Bloom filter for json column. BF is off
{
  "data": [
    {
      "json": "{\"key\":38, \"value\":\"NXONM\"}"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
Bloom filter for json column. BF is on
{
  "data": [
    {
      "json": "{\"key\":38, \"value\":\"NXONM\"}"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 72,
    "bytes_read": 4005
  }
}
Bloom filter for ipv4 column. BF is off
{
  "data": [
    {
      "json": "{\"key\":38, \"value\":\"NXONM\"}"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
Bloom filter for ipv4 column. BF is on
{
  "data": [
    {
      "json": "{\"key\":38, \"value\":\"NXONM\"}"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 72,
    "bytes_read": 4005
  }
}
Bloom filter for ipv4 column. BF is on. Specified in the schema
{
  "data": [
    {
      "ipv4": "0.0.1.143"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 72,
    "bytes_read": 4005
  }
}
Bloom filter on 64 bit column read as ipv4. We explicitly deny it, should read all rg
{
  "data": [
    {
      "uint64_logical": "22.230.220.164"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
BF off for parquet uint64 logical type. Should read everything
{
  "data": [
    {
      "json": "{\"key\":683, \"value\":\"YKCPD\"}"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
BF on for parquet uint64 logical type. Uint64 is stored as a signed int 64, but with logical annotation. Make sure a value greater than int64 can be queried
{
  "data": [
    {
      "json": "{\"key\":683, \"value\":\"YKCPD\"}"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 464,
    "bytes_read": 21711
  }
}
Uint16 is stored as physical type int32 with bitwidth = 16 and sign = false. Make sure a value greater than int16 can be queried. BF is on.
{
  "data": [
    {
      "json": "{\"key\":874, \"value\":\"JENHW\"}"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 464,
    "bytes_read": 21703
  }
}
BF off for parquet int8 logical type. Should read everything
{
  "data": [
    {
      "json": "{\"key\":89, \"value\":\"MFIYP\"}"
    },
    {
      "json": "{\"key\":321, \"value\":\"JNOIA\"}"
    },
    {
      "json": "{\"key\":938, \"value\":\"UBMLO\"}"
    },
    {
      "json": "{\"key\":252, \"value\":\"ZVLKF\"}"
    }
  ],
  "rows": 4,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
BF on for parquet int8 logical type. Should skip row groups
{
  "data": [
    {
      "json": "{\"key\":89, \"value\":\"MFIYP\"}"
    },
    {
      "json": "{\"key\":321, \"value\":\"JNOIA\"}"
    },
    {
      "json": "{\"key\":938, \"value\":\"UBMLO\"}"
    },
    {
      "json": "{\"key\":252, \"value\":\"ZVLKF\"}"
    }
  ],
  "rows": 4,
  "statistics": {
    "rows_read": 536,
    "bytes_read": 25716
  }
}
Invalid column conversion with in operation. String type can not be hashed against parquet int64 physical type. Should read everything
{
  "data": [],
  "rows": 0,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
Transformations on key column shall not be allowed. Should read everything
{
  "data": [
    {
      "uint64_logical": "7711695863945021976"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 1000,
    "bytes_read": 47419
  }
}
tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down.sh (Executable file, 96 lines)
@@ -0,0 +1,96 @@
#!/usr/bin/env bash
# Tags: no-ubsan, no-fasttest

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh


USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}"

mkdir -p "${WORKING_DIR}"

DATA_FILE="${CUR_DIR}/data_parquet/multi_column_bf.gz.parquet"

DATA_FILE_USER_PATH="${WORKING_DIR}/multi_column_bf.gz.parquet"

cp ${DATA_FILE} ${DATA_FILE_USER_PATH}

${CLICKHOUSE_CLIENT} --query="select count(*) from file('${DATA_FILE_USER_PATH}', Parquet) SETTINGS use_cache_for_count_from_files=false;"

echo "bloom filter is off, all row groups should be read"
echo "expect rows_read = select count()"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or flba='WNMM' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false" | jq 'del(.meta,.statistics.elapsed)'

echo "bloom filter is on, some row groups should be skipped"
echo "expect rows_read much less than select count()"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or flba='WNMM' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false" | jq 'del(.meta,.statistics.elapsed)'

echo "bloom filter is on, but where predicate contains data from 2 row groups out of 3."
echo "Rows read should be less than select count, but greater than previous selects"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or string='ZHZK' order by uint16_logical asc Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "bloom filter is on, but where predicate contains data from all row groups"
echo "expect rows_read = select count()"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string='PFJH' or string='ZHZK' or uint64_logical=18441251162536403933 order by uint16_logical asc Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "IN check"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where string in ('PFJH', 'ZHZK') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "tuple in case, bf is off."
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in ('PFJH', 'GKJC') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "tuple in case, bf is on."
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in ('PFJH', 'GKJC') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "complex tuple in case, bf is off"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('PFJH', 'GKJC'), ('NON2', 'NON2')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "complex tuple in case, bf is on"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('PFJH', 'GKJC'), ('NON2', 'NON2')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "complex tuple in case, bf is on. Non existent"
${CLICKHOUSE_CLIENT} --query="select string, flba from file('${DATA_FILE_USER_PATH}', Parquet) where (string, flba) in (('NON1', 'NON1'), ('NON2', 'NON2'), ('NON3', 'NON3')) order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "Bloom filter for json column. BF is off"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "Bloom filter for json column. BF is on"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where json = '{\"key\":38, \"value\":\"NXONM\"}' order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "Bloom filter for ipv4 column. BF is off"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "Bloom filter for ipv4 column. BF is on"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where ipv4 = IPv4StringToNum('0.0.1.143') order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "Bloom filter for ipv4 column. BF is on. Specified in the schema"
${CLICKHOUSE_CLIENT} --query="select ipv4 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv4 IPv4') where ipv4 = toIPv4('0.0.1.143') order by ipv4 asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "Bloom filter on 64 bit column read as ipv4. We explicitly deny it, should read all rg"
${CLICKHOUSE_CLIENT} --query="select uint64_logical from file ('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical IPv4') where uint64_logical = toIPv4(5552715629697883300) order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "BF off for parquet uint64 logical type. Should read everything"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "BF on for parquet uint64 logical type. Uint64 is stored as a signed int 64, but with logical annotation. Make sure a value greater than int64 can be queried"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint64_logical=18441251162536403933 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "Uint16 is stored as physical type int32 with bitwidth = 16 and sign = false. Make sure a value greater than int16 can be queried. BF is on."
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where uint16_logical=65528 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "BF off for parquet int8 logical type. Should read everything"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "BF on for parquet int8 logical type. Should skip row groups"
${CLICKHOUSE_CLIENT} --query="select json from file('${DATA_FILE_USER_PATH}', Parquet) where int8_logical=-126 order by uint16_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "Invalid column conversion with in operation. String type can not be hashed against parquet int64 physical type. Should read everything"
${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet, 'uint64_logical String') where uint64_logical in ('5') order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "Transformations on key column shall not be allowed. Should read everything"
${CLICKHOUSE_CLIENT} --query="select uint64_logical from file('${DATA_FILE_USER_PATH}', Parquet) where negate(uint64_logical) = -7711695863945021976 order by uint64_logical asc FORMAT Json SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

rm -rf ${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME:?}/*
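All of the queries above follow one pattern: bloom-filter pruning is toggled with `input_format_parquet_bloom_filter_push_down` while `input_format_parquet_filter_push_down` stays disabled so min/max pruning does not mask the effect, and skipped row groups surface as a smaller `rows_read` in the JSON statistics. A minimal standalone sketch (the file name and the `key` column are placeholders, not the test data):

```sql
-- Placeholders: 'data.parquet' and the column 'key' are not part of the test file.
SELECT count()
FROM file('data.parquet', Parquet)
WHERE key = 'needle'
FORMAT JSON
SETTINGS
    input_format_parquet_bloom_filter_push_down = true,  -- prune row groups via Parquet bloom filters
    input_format_parquet_filter_push_down = false;       -- keep min/max pruning out of the comparison

-- Rerun with input_format_parquet_bloom_filter_push_down = false and compare statistics.rows_read.
```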
@@ -0,0 +1,76 @@
bloom filter is off, row groups should be read
expect rows_read = select count()
{
  "data": [
    {
      "ipv6": "7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 5,
    "bytes_read": 128
  }
}
bloom filter is on for ipv6, row groups should also be read since there is only one. Below queries just make sure the data is properly returned
{
  "data": [
    {
      "ipv6": "7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 5,
    "bytes_read": 128
  }
}
{
  "data": [
    {
      "ipv6": "7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 5,
    "bytes_read": 128
  }
}
{
  "data": [
    {
      "toIPv6(ipv6)": "7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995"
    }
  ],
  "rows": 1,
  "statistics": {
    "rows_read": 5,
    "bytes_read": 128
  }
}
non existent ipv6, row group should be skipped
{
  "data": [],
  "rows": 0,
  "statistics": {
    "rows_read": 0,
    "bytes_read": 0
  }
}
{
  "data": [],
  "rows": 0,
  "statistics": {
    "rows_read": 0,
    "bytes_read": 0
  }
}
{
  "data": [],
  "rows": 0,
  "statistics": {
    "rows_read": 5,
    "bytes_read": 128
  }
}
tests/queries/0_stateless/03036_test_parquet_bloom_filter_push_down_ipv6.sh (Executable file, 33 lines)
@@ -0,0 +1,33 @@
#!/usr/bin/env bash
# Tags: no-ubsan, no-fasttest

CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh


USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')

WORKING_DIR="${USER_FILES_PATH}/${CLICKHOUSE_TEST_UNIQUE_NAME}"

mkdir -p "${WORKING_DIR}"

DATA_FILE="${CUR_DIR}/data_parquet/ipv6_bloom_filter.gz.parquet"

DATA_FILE_USER_PATH="${WORKING_DIR}/ipv6_bloom_filter.gz.parquet"

cp ${DATA_FILE} ${DATA_FILE_USER_PATH}

echo "bloom filter is off, row groups should be read"
echo "expect rows_read = select count()"
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = '7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=false, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "bloom filter is on for ipv6, row groups should also be read since there is only one. Below queries just make sure the data is properly returned"
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = '7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = toIPv6('7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
${CLICKHOUSE_CLIENT} --query="select toIPv6(ipv6) from file('${DATA_FILE_USER_PATH}', Parquet) where ipv6 = toIPv6('7afe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'

echo "non existent ipv6, row group should be skipped"
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = 'fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995' Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
${CLICKHOUSE_CLIENT} --query="select ipv6 from file('${DATA_FILE_USER_PATH}', Parquet, 'ipv6 IPv6') where ipv6 = toIPv6('fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
${CLICKHOUSE_CLIENT} --query="select toIPv6(ipv6) from file('${DATA_FILE_USER_PATH}', Parquet) where ipv6 = toIPv6('fafe:b9d4:e754:4e78:8783:37f5:b2ea:9995') Format JSON SETTINGS input_format_parquet_bloom_filter_push_down=true, input_format_parquet_filter_push_down=false;" | jq 'del(.meta,.statistics.elapsed)'
@@ -0,0 +1,102 @@
Arrow
9260153077572524277
ArrowStream
9260153077572524277
Avro
9260153077572524277
BSONEachRow
9260153077572524277
CSV
9260153077572524277
CSVWithNames
9260153077572524277
CSVWithNamesAndTypes
9260153077572524277
CapnProto
9260153077572524277
CustomSeparated
9260153077572524277
CustomSeparatedWithNames
9260153077572524277
CustomSeparatedWithNamesAndTypes
9260153077572524277
JSON
9260153077572524277
JSONColumns
9260153077572524277
JSONColumnsWithMetadata
9260153077572524277
JSONCompact
9260153077572524277
JSONCompactColumns
9260153077572524277
JSONCompactEachRow
9260153077572524277
JSONCompactEachRowWithNames
9260153077572524277
JSONCompactEachRowWithNamesAndTypes
9260153077572524277
JSONCompactStringsEachRow
9260153077572524277
JSONCompactStringsEachRowWithNames
9260153077572524277
JSONCompactStringsEachRowWithNamesAndTypes
9260153077572524277
JSONEachRow
9260153077572524277
JSONLines
9260153077572524277
JSONObjectEachRow
9260153077572524277
JSONStringsEachRow
9260153077572524277
MsgPack
9260153077572524277
NDJSON
9260153077572524277
Native
9260153077572524277
ORC
9260153077572524277
Parquet
9260153077572524277
Raw
9260153077572524277
RawWithNames
9260153077572524277
RawWithNamesAndTypes
9260153077572524277
RowBinary
9260153077572524277
RowBinaryWithNames
9260153077572524277
RowBinaryWithNamesAndTypes
9260153077572524277
TSKV
9260153077572524277
TSV
9260153077572524277
TSVRaw
9260153077572524277
TSVRawWithNames
9260153077572524277
TSVRawWithNamesAndTypes
9260153077572524277
TSVWithNames
9260153077572524277
TSVWithNamesAndTypes
9260153077572524277
TabSeparated
9260153077572524277
TabSeparatedRaw
9260153077572524277
TabSeparatedRawWithNames
9260153077572524277
TabSeparatedRawWithNamesAndTypes
9260153077572524277
TabSeparatedWithNames
9260153077572524277
TabSeparatedWithNamesAndTypes
9260153077572524277
Values
9260153077572524277
tests/queries/0_stateless/03251_insert_sparse_all_formats.sh (Executable file, 35 lines)
@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Tags: no-fasttest, long

set -e

CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

formats=$($CLICKHOUSE_CLIENT --query "
    SELECT name FROM system.formats
    WHERE is_input AND is_output AND name NOT IN ('Template', 'Npy', 'RawBLOB', 'ProtobufList', 'ProtobufSingle', 'Protobuf', 'LineAsString')
    ORDER BY name FORMAT TSV
")

$CLICKHOUSE_CLIENT --query "
    DROP TABLE IF EXISTS t_sparse_all_formats;
    CREATE TABLE t_sparse_all_formats (a UInt64, b UInt64, c String) ENGINE = MergeTree ORDER BY a;
"

for format in $formats; do
    echo $format
    $CLICKHOUSE_CLIENT --query "INSERT INTO t_sparse_all_formats(a) SELECT number FROM numbers(1000)"

    $CLICKHOUSE_CLIENT --query "SELECT number AS a, 0::UInt64 AS b, '' AS c FROM numbers(1000) FORMAT $format" \
        | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+t_sparse_all_formats+FORMAT+$format&enable_parsing_to_custom_serialization=1" --data-binary @-

    $CLICKHOUSE_CLIENT --query "SELECT number AS a FROM numbers(1000) FORMAT $format" \
        | ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&query=INSERT+INTO+t_sparse_all_formats(a)+FORMAT+$format&enable_parsing_to_custom_serialization=1" --data-binary @-

    $CLICKHOUSE_CLIENT --query "
        SELECT sum(sipHash64(*)) FROM t_sparse_all_formats;
        TRUNCATE TABLE t_sparse_all_formats;
    "
done
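The loop pushes the same rows through every round-trip format over HTTP with `enable_parsing_to_custom_serialization=1` in the URL; the identical sipHash64 value for every format in the reference confirms the data survives the round trip. Judging by the test name, the setting lets parsed input be placed directly into columns with custom (e.g. Sparse) serialization. The same setting can be given on the INSERT itself; a minimal sketch against the test table, with JSONEachRow standing in for `$format`:

```sql
-- Sketch only: JSONEachRow stands in for the $format loop variable above.
INSERT INTO t_sparse_all_formats (a)
SETTINGS enable_parsing_to_custom_serialization = 1
FORMAT JSONEachRow
{"a": 1}
{"a": 2}
```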
Binary file not shown.
Binary file not shown.