Merge pull request #39418 from vdimir/join_and_sets

Filter joined streams for `full_sorting_join` by each other before sorting
2024-11-28 02:21:59 +00:00 · 2022-09-02 13:57:06 +02:00 · 2022-09-02 13:57:06 +02:00 · 963c0111bf
commit 963c0111bf
parent 6815d79102 12e6fc4182
26 changed files with 1155 additions and 53 deletions
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -366,6 +366,8 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(UInt64, partial_merge_join_left_table_buffer_bytes, 0, "If not 0 group left table blocks in bigger ones for left-side table in partial merge join. It uses up to 2x of specified memory per joining thread.", 0) \
    M(UInt64, partial_merge_join_rows_in_right_blocks, 65536, "Split right-hand joining data in blocks of specified size. It's a portion of data indexed by min-max values and possibly unloaded on disk.", 0) \
    M(UInt64, join_on_disk_max_files_to_merge, 64, "For MergeJoin on disk set how much files it's allowed to sort simultaneously. Then this value bigger then more memory used and then less disk I/O needed. Minimum is 2.", 0) \
+    M(UInt64, max_rows_in_set_to_optimize_join, 100'000, "Maximal size of the set to filter joined tables by each other row sets before joining. 0 - disable.", 0) \
+    \
    M(Bool, compatibility_ignore_collation_in_create_table, true, "Compatibility ignore collation in create table", 0) \
    \
    M(String, temporary_files_codec, "LZ4", "Set compression codec for temporary files (sort and join on disk). I.e. LZ4, NONE.", 0) \
--- a/src/Interpreters/InterpreterSelectQuery.cpp
+++ b/src/Interpreters/InterpreterSelectQuery.cpp
@ -39,6 +39,7 @@
 #include <QueryPipeline/Pipe.h>
 #include <Processors/QueryPlan/AggregatingStep.h>
 #include <Processors/QueryPlan/ArrayJoinStep.h>
+#include <Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.h>
 #include <Processors/QueryPlan/CreatingSetsStep.h>
 #include <Processors/QueryPlan/CubeStep.h>
 #include <Processors/QueryPlan/DistinctStep.h>
@ -1436,7 +1437,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional<P
                    if (!joined_plan)
                        throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no joined plan for query");

-                    auto add_sorting = [&settings, this] (QueryPlan & plan, const Names & key_names, bool is_right)
+                    auto add_sorting = [&settings, this] (QueryPlan & plan, const Names & key_names, JoinTableSide join_pos)
                    {
                        SortDescription order_descr;
                        order_descr.reserve(key_names.size());
@ -1455,15 +1456,43 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, std::optional<P
                            this->context->getTemporaryVolume(),
                            settings.min_free_disk_space_for_temporary_data,
                            settings.optimize_sorting_by_input_stream_properties);
-                        sorting_step->setStepDescription(fmt::format("Sort {} before JOIN", is_right ? "right" : "left"));
+                        sorting_step->setStepDescription(fmt::format("Sort {} before JOIN", join_pos));
                        plan.addStep(std::move(sorting_step));
                    };

+                    auto crosswise_connection = CreateSetAndFilterOnTheFlyStep::createCrossConnection();
+                    auto add_create_set = [&settings, crosswise_connection](QueryPlan & plan, const Names & key_names, JoinTableSide join_pos)
+                    {
+                        auto creating_set_step = std::make_unique<CreateSetAndFilterOnTheFlyStep>(
+                            plan.getCurrentDataStream(), key_names, settings.max_rows_in_set_to_optimize_join, crosswise_connection, join_pos);
+                        creating_set_step->setStepDescription(fmt::format("Create set and filter {} joined stream", join_pos));
+
+                        auto * step_raw_ptr = creating_set_step.get();
+                        plan.addStep(std::move(creating_set_step));
+                        return step_raw_ptr;
+                    };
+
                    if (expressions.join->pipelineType() == JoinPipelineType::YShaped)
                    {
-                        const auto & join_clause = expressions.join->getTableJoin().getOnlyClause();
-                        add_sorting(query_plan, join_clause.key_names_left, false);
-                        add_sorting(*joined_plan, join_clause.key_names_right, true);
+                        const auto & table_join = expressions.join->getTableJoin();
+                        const auto & join_clause = table_join.getOnlyClause();
+
+                        auto join_kind = table_join.kind();
+                        bool kind_allows_filtering = isInner(join_kind) || isLeft(join_kind) || isRight(join_kind);
+                        if (settings.max_rows_in_set_to_optimize_join > 0 && kind_allows_filtering)
+                        {
+                            auto * left_set = add_create_set(query_plan, join_clause.key_names_left, JoinTableSide::Left);
+                            auto * right_set = add_create_set(*joined_plan, join_clause.key_names_right, JoinTableSide::Right);
+
+                            if (isInnerOrLeft(join_kind))
+                                right_set->setFiltering(left_set->getSet());
+
+                            if (isInnerOrRight(join_kind))
+                                left_set->setFiltering(right_set->getSet());
+                        }
+
+                        add_sorting(query_plan, join_clause.key_names_left, JoinTableSide::Left);
+                        add_sorting(*joined_plan, join_clause.key_names_right, JoinTableSide::Right);
                    }

                    QueryPlanStepPtr join_step = std::make_unique<JoinStep>(
--- a/src/Interpreters/Set.cpp
+++ b/src/Interpreters/Set.cpp
@ -22,6 +22,8 @@
 #include <Interpreters/castColumn.h>
 #include <Interpreters/Context.h>

+#include <Processors/Chunk.h>
+
 #include <Storages/MergeTree/KeyCondition.h>

 #include <base/range.h>
@ -162,8 +164,16 @@ void Set::setHeader(const ColumnsWithTypeAndName & header)
    data.init(data.chooseMethod(key_columns, key_sizes));
 }

-
 bool Set::insertFromBlock(const ColumnsWithTypeAndName & columns)
+{
+    Columns cols;
+    cols.reserve(columns.size());
+    for (const auto & column : columns)
+        cols.emplace_back(column.column);
+    return insertFromBlock(cols);
+}
+
+bool Set::insertFromBlock(const Columns & columns)
 {
    std::lock_guard<std::shared_mutex> lock(rwlock);

@ -179,11 +189,11 @@ bool Set::insertFromBlock(const ColumnsWithTypeAndName & columns)
    /// Remember the columns we will work with
    for (size_t i = 0; i < keys_size; ++i)
    {
-        materialized_columns.emplace_back(columns.at(i).column->convertToFullIfNeeded());
+        materialized_columns.emplace_back(columns.at(i)->convertToFullIfNeeded());
        key_columns.emplace_back(materialized_columns.back().get());
    }

-    size_t rows = columns.at(0).column->size();
+    size_t rows = columns.at(0)->size();

    /// We will insert to the Set only keys, where all components are not NULL.
    ConstNullMapPtr null_map{};
--- a/src/Interpreters/Set.h
+++ b/src/Interpreters/Set.h
@ -20,6 +20,7 @@ class Context;
 class IFunctionBase;
 using FunctionBasePtr = std::shared_ptr<IFunctionBase>;

+class Chunk;

 /** Data structure for implementation of IN expression.
  */
@ -45,11 +46,14 @@ public:
    void setHeader(const ColumnsWithTypeAndName & header);

    /// Returns false, if some limit was exceeded and no need to insert more data.
+    bool insertFromBlock(const Columns & columns);
    bool insertFromBlock(const ColumnsWithTypeAndName & columns);
+
    /// Call after all blocks were inserted. To get the information that set is already created.
    void finishInsert() { is_created = true; }

-    bool isCreated() const { return is_created; }
+    /// finishInsert and isCreated are thread-safe
+    bool isCreated() const { return is_created.load(); }

    /** For columns of 'block', check belonging of corresponding rows to the set.
      * Return UInt8 column with the result.
@ -111,7 +115,7 @@ private:
    bool transform_null_in;

    /// Check if set contains all the data.
-    bool is_created = false;
+    std::atomic<bool> is_created = false;

    /// If in the left part columns contains the same types as the elements of the set.
    void executeOrdinary(
--- a/src/Interpreters/TableJoin.h
+++ b/src/Interpreters/TableJoin.h
@ -73,16 +73,32 @@ public:
            return key_names_right.size();
        }

-        String formatDebug() const
+        String formatDebug(bool short_format = false) const
        {
-            return fmt::format("Left keys: [{}] Right keys [{}] Condition columns: '{}', '{}'",
-                               fmt::join(key_names_left, ", "), fmt::join(key_names_right, ", "),
-                               condColumnNames().first, condColumnNames().second);
+            const auto & [left_cond, right_cond] = condColumnNames();
+
+            if (short_format)
+            {
+                return fmt::format("({}) = ({}){}{}", fmt::join(key_names_left, ", "), fmt::join(key_names_right, ", "),
+                                   !left_cond.empty() ? " AND " + left_cond : "", !right_cond.empty() ? " AND " + right_cond : "");
+            }
+
+            return fmt::format(
+                "Left keys: [{}] Right keys [{}] Condition columns: '{}', '{}'",
+                 fmt::join(key_names_left, ", "), fmt::join(key_names_right, ", "), left_cond, right_cond);
        }
    };

    using Clauses = std::vector<JoinOnClause>;

+    static std::string formatClauses(const Clauses & clauses, bool short_format = false)
+    {
+        std::vector<std::string> res;
+        for (const auto & clause : clauses)
+            res.push_back("[" + clause.formatDebug(short_format) + "]");
+        return fmt::format("{}", fmt::join(res, "; "));
+    }
+
 private:
    /** Query of the form `SELECT expr(x) AS k FROM t1 ANY LEFT JOIN (SELECT expr(x) AS k FROM t2) USING k`
      * The join is made by column k.
--- a/src/Processors/PingPongProcessor.cpp
+++ b/src/Processors/PingPongProcessor.cpp
@ -0,0 +1,198 @@
+#include <Processors/PingPongProcessor.h>
+
+namespace DB
+{
+
+/// Create list with `num_ports` of regular ports and 1 auxiliary port with empty header.
+template <typename T> requires std::is_same_v<T, InputPorts> || std::is_same_v<T, OutputPorts>
+static T createPortsWithSpecial(const Block & header, size_t num_ports)
+{
+    T res(num_ports, header);
+    res.emplace_back(Block());
+    return res;
+}
+
+PingPongProcessor::PingPongProcessor(const Block & header, size_t num_ports, Order order_)
+    : IProcessor(createPortsWithSpecial<InputPorts>(header, num_ports),
+                 createPortsWithSpecial<OutputPorts>(header, num_ports))
+    , aux_in_port(inputs.back())
+    , aux_out_port(outputs.back())
+    , order(order_)
+{
+    assert(order == First || order == Second);
+
+    port_pairs.resize(num_ports);
+
+    auto input_it = inputs.begin();
+    auto output_it = outputs.begin();
+    for (size_t i = 0; i < num_ports; ++i)
+    {
+        port_pairs[i].input_port = &*input_it;
+        ++input_it;
+
+        port_pairs[i].output_port = &*output_it;
+        ++output_it;
+    }
+}
+
+void PingPongProcessor::finishPair(PortsPair & pair)
+{
+    if (!pair.is_finished)
+    {
+        pair.output_port->finish();
+        pair.input_port->close();
+
+        pair.is_finished = true;
+        ++num_finished_pairs;
+    }
+}
+
+bool PingPongProcessor::processPair(PortsPair & pair)
+{
+    if (pair.output_port->isFinished())
+    {
+        finishPair(pair);
+        return false;
+    }
+
+    if (pair.input_port->isFinished())
+    {
+        finishPair(pair);
+        return false;
+    }
+
+    if (!pair.output_port->canPush())
+    {
+        pair.input_port->setNotNeeded();
+        return false;
+    }
+
+    pair.input_port->setNeeded();
+    if (pair.input_port->hasData())
+    {
+        Chunk chunk = pair.input_port->pull(true);
+        ready_to_send |= consume(chunk);
+        pair.output_port->push(std::move(chunk));
+    }
+
+    return true;
+}
+
+bool PingPongProcessor::isPairsFinished() const
+{
+    return num_finished_pairs == port_pairs.size();
+}
+
+IProcessor::Status PingPongProcessor::processRegularPorts()
+{
+    if (isPairsFinished())
+        return Status::Finished;
+
+    bool need_data = false;
+
+    for (auto & pair : port_pairs)
+        need_data = processPair(pair) || need_data;
+
+    if (isPairsFinished())
+        return Status::Finished;
+
+    if (need_data)
+        return Status::NeedData;
+
+    return Status::PortFull;
+}
+
+bool PingPongProcessor::sendPing()
+{
+    if (aux_out_port.canPush())
+    {
+        Chunk chunk(aux_out_port.getHeader().cloneEmpty().getColumns(), 0);
+        aux_out_port.push(std::move(chunk));
+        is_send = true;
+        aux_out_port.finish();
+        return true;
+    }
+    return false;
+}
+
+bool PingPongProcessor::recievePing()
+{
+    if (aux_in_port.hasData())
+    {
+        aux_in_port.pull();
+        is_received = true;
+        aux_in_port.close();
+        return true;
+    }
+    return false;
+}
+
+bool PingPongProcessor::canSend() const
+{
+    return !is_send && (ready_to_send || isPairsFinished());
+}
+
+IProcessor::Status PingPongProcessor::prepare()
+{
+    if (!set_needed_once && !is_received && !aux_in_port.isFinished())
+    {
+        set_needed_once = true;
+        aux_in_port.setNeeded();
+    }
+
+    if (order == First || is_send)
+    {
+        if (!is_received)
+        {
+            bool received = recievePing();
+            if (!received)
+            {
+                return Status::NeedData;
+            }
+        }
+    }
+
+    if (order == Second || is_received)
+    {
+        if (!is_send && canSend())
+        {
+            bool sent = sendPing();
+            if (!sent)
+                return Status::PortFull;
+        }
+    }
+
+    auto status = processRegularPorts();
+    if (status == Status::Finished)
+    {
+        if (order == First || is_send)
+        {
+            if (!is_received)
+            {
+                bool received = recievePing();
+                if (!received)
+                {
+                    return Status::NeedData;
+                }
+            }
+        }
+
+        if (order == Second || is_received)
+        {
+            if (!is_send && canSend())
+            {
+                bool sent = sendPing();
+                if (!sent)
+                    return Status::PortFull;
+            }
+        }
+    }
+    return status;
+}
+
+std::pair<InputPort *, OutputPort *> PingPongProcessor::getAuxPorts()
+{
+    return std::make_pair(&aux_in_port, &aux_out_port);
+}
+
+}
--- a/src/Processors/PingPongProcessor.h
+++ b/src/Processors/PingPongProcessor.h
@ -0,0 +1,105 @@
+#pragma once
+
+#include <Processors/IProcessor.h>
+#include <base/unit.h>
+#include <Processors/Chunk.h>
+#include <Common/logger_useful.h>
+
+namespace DB
+{
+
+/*
+ * Processor with N inputs and N outputs. Moves data from i-th input to i-th output as is.
+ * It has a pair of auxiliary ports to notify another instance by sending empty chunk after some condition holds.
+ * You should use this processor in pair of instances and connect auxiliary ports crosswise.
+ *
+ *     ╭─┴───┴───┴───┴───┴─╮       ╭─┴───┴───┴───┴───┴─╮
+ *     │                   ├─ aux ⟶│                   │
+ *     │ PingPongProcessor │       │ PingPongProcessor │
+ *     │                   │⟵ aux ─┤                   │
+ *     ╰─┬───┬───┬───┬───┬─╯       ╰─┬───┬───┬───┬───┬─╯
+ *
+ * One of the processors starts processing data, and another waits for notification.
+ * When `consume` returns true, the first stops processing, sends a ping to another and waits for notification.
+ * After that, the second one also processes data until `consume`, then send a notification back to the first one.
+ * After this roundtrip, processors bypass data from regular inputs to outputs.
+ */
+class PingPongProcessor : public IProcessor
+{
+public:
+    enum class Order : uint8_t
+    {
+        /// Processor that starts processing data.
+        First,
+        /// Processor that waits for notification.
+        Second,
+    };
+
+    using enum Order;
+
+    PingPongProcessor(const Block & header, size_t num_ports, Order order_);
+
+    Status prepare() override;
+
+    std::pair<InputPort *, OutputPort *> getAuxPorts();
+
+    /// Returns `true` when enough data consumed
+    virtual bool consume(const Chunk & chunk) = 0;
+
+protected:
+    struct PortsPair
+    {
+        InputPort * input_port = nullptr;
+        OutputPort * output_port = nullptr;
+        bool is_finished = false;
+    };
+
+    bool sendPing();
+    bool recievePing();
+    bool canSend() const;
+
+    bool isPairsFinished() const;
+    bool processPair(PortsPair & pair);
+    void finishPair(PortsPair & pair);
+    Status processRegularPorts();
+
+    std::vector<PortsPair> port_pairs;
+    size_t num_finished_pairs = 0;
+
+    InputPort & aux_in_port;
+    OutputPort & aux_out_port;
+
+    bool is_send = false;
+    bool is_received = false;
+
+    bool ready_to_send = false;
+
+    /// Used to set 'needed' flag once for auxiliary input at first `prepare` call.
+    bool set_needed_once = false;
+
+    Order order;
+};
+
+/// Reads first N rows from two streams evenly.
+class ReadHeadBalancedProcessor : public PingPongProcessor
+{
+public:
+    ReadHeadBalancedProcessor(const Block & header, size_t num_ports, size_t size_to_wait_, Order order_)
+        : PingPongProcessor(header, num_ports, order_) , data_consumed(0) , size_to_wait(size_to_wait_)
+    {
+    }
+
+    String getName() const override { return "ReadHeadBalancedProcessor"; }
+
+    bool consume(const Chunk & chunk) override
+    {
+        data_consumed += chunk.getNumRows();
+        return data_consumed > size_to_wait;
+    }
+
+private:
+    size_t data_consumed;
+    size_t size_to_wait;
+};
+
+}
--- a/src/Processors/Port.cpp
+++ b/src/Processors/Port.cpp
@ -8,16 +8,16 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

-void connect(OutputPort & output, InputPort & input)
+void connect(OutputPort & output, InputPort & input, bool reconnect)
 {
-    if (input.state)
+    if (!reconnect && input.state)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Port is already connected, (header: [{}])", input.header.dumpStructure());

-    if (output.state)
+    if (!reconnect && output.state)
        throw Exception(ErrorCodes::LOGICAL_ERROR, "Port is already connected, (header: [{}])", output.header.dumpStructure());

-    auto out_name = output.getProcessor().getName();
-    auto in_name = input.getProcessor().getName();
+    auto out_name = output.processor ? output.getProcessor().getName() : "null";
+    auto in_name = input.processor ? input.getProcessor().getName() : "null";

    assertCompatibleHeader(output.getHeader(), input.getHeader(), fmt::format("function connect between {} and {}", out_name, in_name));

--- a/src/Processors/Port.h
+++ b/src/Processors/Port.h
@ -25,7 +25,7 @@ namespace ErrorCodes

 class Port
 {
-    friend void connect(OutputPort &, InputPort &);
+    friend void connect(OutputPort &, InputPort &, bool);
    friend class IProcessor;

 public:
@ -267,7 +267,7 @@ protected:
 ///   * You can pull only if port hasData().
 class InputPort : public Port
 {
-    friend void connect(OutputPort &, InputPort &);
+    friend void connect(OutputPort &, InputPort &, bool);

 private:
    OutputPort * output_port = nullptr;
@ -390,7 +390,7 @@ public:
 ///   * You can push only if port doesn't hasData().
 class OutputPort : public Port
 {
-    friend void connect(OutputPort &, InputPort &);
+    friend void connect(OutputPort &, InputPort &, bool);

 private:
    InputPort * input_port = nullptr;
@ -483,6 +483,6 @@ using InputPorts = std::list<InputPort>;
 using OutputPorts = std::list<OutputPort>;


-void connect(OutputPort & output, InputPort & input);
+void connect(OutputPort & output, InputPort & input, bool reconnect = false);

 }
--- a/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.cpp
+++ b/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.cpp
@ -0,0 +1,205 @@
+#include <Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.h>
+#include <Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.h>
+
+#include <QueryPipeline/QueryPipelineBuilder.h>
+#include <IO/Operators.h>
+#include <Common/JSONBuilder.h>
+#include <Core/ColumnWithTypeAndName.h>
+#include <Core/ColumnsWithTypeAndName.h>
+#include <Processors/IProcessor.h>
+#include <Processors/PingPongProcessor.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+static void connectAllInputs(OutputPortRawPtrs ports, InputPorts & inputs, size_t num_ports)
+{
+    auto input_it = inputs.begin();
+    for (size_t i = 0; i < num_ports; ++i)
+    {
+        connect(*ports[i], *input_it);
+        input_it++;
+    }
+}
+
+static ColumnsWithTypeAndName getColumnSubset(const Block & block, const Names & column_names)
+{
+    ColumnsWithTypeAndName result;
+    for (const auto & name : column_names)
+        result.emplace_back(block.getByName(name));
+    return result;
+}
+
+static ITransformingStep::Traits getTraits()
+{
+    return ITransformingStep::Traits
+    {
+        {
+            .preserves_distinct_columns = true,
+            .returns_single_stream = false,
+            .preserves_number_of_streams = true,
+            .preserves_sorting = true,
+        },
+        {
+            .preserves_number_of_rows = false,
+        }
+    };
+}
+
+class CreateSetAndFilterOnTheFlyStep::CrosswiseConnection : public boost::noncopyable
+{
+public:
+    using PortPair = std::pair<InputPort *, OutputPort *>;
+
+    /// Remember ports passed on the first call and connect with ones from second call.
+    /// Thread-safe.
+    void connectPorts(PortPair rhs_ports, IProcessor * proc)
+    {
+        assert(!rhs_ports.first->isConnected() && !rhs_ports.second->isConnected());
+
+        std::lock_guard<std::mutex> lock(mux);
+        if (input_port || output_port)
+        {
+            assert(input_port && output_port);
+            assert(!input_port->isConnected());
+            connect(*rhs_ports.second, *input_port);
+            connect(*output_port, *rhs_ports.first, /* reconnect= */ true);
+        }
+        else
+        {
+            std::tie(input_port, output_port) = rhs_ports;
+            assert(input_port && output_port);
+            assert(!input_port->isConnected() && !output_port->isConnected());
+
+            dummy_input_port = std::make_unique<InputPort>(output_port->getHeader(), proc);
+            connect(*output_port, *dummy_input_port);
+        }
+    }
+
+private:
+    std::mutex mux;
+    InputPort * input_port = nullptr;
+    OutputPort * output_port = nullptr;
+
+    /// Output ports should always be connected, and we can't add a step to the pipeline without them.
+    /// So, connect the port from the first processor to this dummy port and then reconnect to the second processor.
+    std::unique_ptr<InputPort> dummy_input_port;
+};
+
+CreateSetAndFilterOnTheFlyStep::CrosswiseConnectionPtr CreateSetAndFilterOnTheFlyStep::createCrossConnection()
+{
+    return std::make_shared<CreateSetAndFilterOnTheFlyStep::CrosswiseConnection>();
+}
+
+CreateSetAndFilterOnTheFlyStep::CreateSetAndFilterOnTheFlyStep(
+    const DataStream & input_stream_,
+    const Names & column_names_,
+    size_t max_rows_in_set_,
+    CrosswiseConnectionPtr crosswise_connection_,
+    JoinTableSide position_)
+    : ITransformingStep(input_stream_, input_stream_.header, getTraits())
+    , column_names(column_names_)
+    , max_rows_in_set(max_rows_in_set_)
+    , own_set(std::make_shared<SetWithState>(SizeLimits(max_rows_in_set, 0, OverflowMode::BREAK), false, true))
+    , filtering_set(nullptr)
+    , crosswise_connection(crosswise_connection_)
+    , position(position_)
+{
+    if (crosswise_connection == nullptr)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Crosswise connection is not initialized");
+
+    if (input_streams.size() != 1)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Step requires exactly one input stream, got {}", input_streams.size());
+
+    own_set->setHeader(getColumnSubset(input_streams[0].header, column_names));
+}
+
+void CreateSetAndFilterOnTheFlyStep::transformPipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings &)
+{
+    size_t num_streams = pipeline.getNumStreams();
+    pipeline.addSimpleTransform([this, num_streams](const Block & header, QueryPipelineBuilder::StreamType stream_type) -> ProcessorPtr
+    {
+        if (stream_type != QueryPipelineBuilder::StreamType::Main)
+            return nullptr;
+        auto res = std::make_shared<CreatingSetsOnTheFlyTransform>(header, column_names, num_streams, own_set);
+        res->setDescription(this->getStepDescription());
+        return res;
+    });
+
+    Block input_header = pipeline.getHeader();
+    auto pipeline_transform = [&input_header, this](OutputPortRawPtrs ports)
+    {
+        Processors result_transforms;
+
+        size_t num_ports = ports.size();
+
+        /// Add balancing transform
+        auto idx = position == JoinTableSide::Left ? PingPongProcessor::First : PingPongProcessor::Second;
+        auto stream_balancer = std::make_shared<ReadHeadBalancedProcessor>(input_header, num_ports, max_rows_in_set, idx);
+        stream_balancer->setDescription(getStepDescription());
+
+        /// Regular inputs just bypass data for respective ports
+        connectAllInputs(ports, stream_balancer->getInputs(), num_ports);
+
+        /// Connect auxiliary ports
+        crosswise_connection->connectPorts(stream_balancer->getAuxPorts(), stream_balancer.get());
+
+        if (!filtering_set)
+        {
+            LOG_DEBUG(log, "Skip filtering {} stream", position);
+            result_transforms.emplace_back(std::move(stream_balancer));
+            return result_transforms;
+        }
+
+        /// Add filtering transform, ports just connected respectively
+        auto & outputs = stream_balancer->getOutputs();
+        auto output_it = outputs.begin();
+        for (size_t i = 0; i < outputs.size() - 1; ++i)
+        {
+            auto & port = *output_it++;
+            auto transform = std::make_shared<FilterBySetOnTheFlyTransform>(port.getHeader(), column_names, filtering_set);
+            transform->setDescription(this->getStepDescription());
+            connect(port, transform->getInputPort());
+            result_transforms.emplace_back(std::move(transform));
+        }
+        assert(output_it == std::prev(outputs.end()));
+        result_transforms.emplace_back(std::move(stream_balancer));
+
+        return result_transforms;
+    };
+
+    /// Auxiliary port stream_balancer can be connected later (by crosswise_connection).
+    /// So, use unsafe `transform` with `check_ports = false` to avoid assertions
+    pipeline.transform(std::move(pipeline_transform), /* check_ports= */ false);
+}
+
+void CreateSetAndFilterOnTheFlyStep::describeActions(JSONBuilder::JSONMap & map) const
+{
+    map.add(getName(), true);
+}
+
+void CreateSetAndFilterOnTheFlyStep::describeActions(FormatSettings & settings) const
+{
+    String prefix(settings.offset, ' ');
+    settings.out << prefix << getName();
+
+    settings.out << '\n';
+}
+
+void CreateSetAndFilterOnTheFlyStep::updateOutputStream()
+{
+    if (input_streams.size() != 1)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "{} requires exactly one input stream, got {}", getName(), input_streams.size());
+
+    own_set->setHeader(getColumnSubset(input_streams[0].header, column_names));
+
+    output_stream = input_streams[0];
+}
+
+
+}
--- a/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.h
+++ b/src/Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.h
@ -0,0 +1,59 @@
+#pragma once
+#include <Processors/QueryPlan/ITransformingStep.h>
+#include <Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.h>
+#include <Processors/DelayedPortsProcessor.h>
+
+
+namespace DB
+{
+
+/*
+ * Used to optimize JOIN when joining a small table over a large table.
+ * Currently applied only for the full sorting join.
+ * It tries to build a set for each stream.
+ * Once one stream is finished, it starts to filter another stream with this set.
+ */
+class CreateSetAndFilterOnTheFlyStep : public ITransformingStep
+{
+public:
+    /// Two instances of step need some shared state to connect processors crosswise
+    class CrosswiseConnection;
+    using CrosswiseConnectionPtr = std::shared_ptr<CrosswiseConnection>;
+    static CrosswiseConnectionPtr createCrossConnection();
+
+    CreateSetAndFilterOnTheFlyStep(
+        const DataStream & input_stream_,
+        const Names & column_names_,
+        size_t max_rows_in_set_,
+        CrosswiseConnectionPtr crosswise_connection_,
+        JoinTableSide position_);
+
+    String getName() const override { return "CreateSetAndFilterOnTheFlyStep"; }
+    void transformPipeline(QueryPipelineBuilder & pipeline, const BuildQueryPipelineSettings & settings) override;
+
+    void describeActions(JSONBuilder::JSONMap & map) const override;
+    void describeActions(FormatSettings & settings) const override;
+
+    SetWithStatePtr getSet() const { return own_set; }
+
+    /// Set for another stream.
+    void setFiltering(SetWithStatePtr filtering_set_) { filtering_set = filtering_set_; }
+
+private:
+    void updateOutputStream() override;
+
+    Names column_names;
+
+    size_t max_rows_in_set;
+
+    SetWithStatePtr own_set;
+    SetWithStatePtr filtering_set;
+
+    CrosswiseConnectionPtr crosswise_connection;
+
+    JoinTableSide position;
+
+    Poco::Logger * log = &Poco::Logger::get("CreateSetAndFilterOnTheFlyStep");
+};
+
+}
--- a/src/Processors/QueryPlan/JoinStep.cpp
+++ b/src/Processors/QueryPlan/JoinStep.cpp
@ -34,8 +34,12 @@ QueryPipelineBuilderPtr JoinStep::updatePipeline(QueryPipelineBuilders pipelines
        throw Exception(ErrorCodes::LOGICAL_ERROR, "JoinStep expect two input steps");

    if (join->pipelineType() == JoinPipelineType::YShaped)
-        return QueryPipelineBuilder::joinPipelinesYShaped(
+    {
+        auto joined_pipeline = QueryPipelineBuilder::joinPipelinesYShaped(
            std::move(pipelines[0]), std::move(pipelines[1]), join, output_stream->header, max_block_size, &processors);
+        joined_pipeline->resize(max_streams);
+        return joined_pipeline;
+    }

    return QueryPipelineBuilder::joinPipelinesRightLeft(
        std::move(pipelines[0]),
--- a/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp
+++ b/src/Processors/QueryPlan/Optimizations/filterPushDown.cpp
@ -8,6 +8,7 @@
 #include <Processors/QueryPlan/Optimizations/Optimizations.h>
 #include <Processors/QueryPlan/ITransformingStep.h>
 #include <Processors/QueryPlan/FilterStep.h>
+#include <Processors/QueryPlan/CreateSetAndFilterOnTheFlyStep.h>
 #include <Processors/QueryPlan/AggregatingStep.h>
 #include <Processors/QueryPlan/ExpressionStep.h>
 #include <Processors/QueryPlan/JoinStep.h>
@ -22,6 +23,7 @@
 #include <Interpreters/ActionsDAG.h>
 #include <Interpreters/ArrayJoinAction.h>
 #include <Interpreters/TableJoin.h>
+#include <fmt/format.h>

 namespace DB::ErrorCodes
 {
@ -134,10 +136,24 @@ tryAddNewFilterStep(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes, con

 static size_t
 tryAddNewFilterStep(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes, const Names & allowed_inputs,
-                    bool can_remove_filter = true)
+                    bool can_remove_filter = true, size_t child_idx = 0)
 {
-    if (auto split_filter = splitFilter(parent_node, allowed_inputs, 0))
-        return tryAddNewFilterStep(parent_node, nodes, split_filter, can_remove_filter, 0);
+    if (auto split_filter = splitFilter(parent_node, allowed_inputs, child_idx))
+        return tryAddNewFilterStep(parent_node, nodes, split_filter, can_remove_filter, child_idx);
+    return 0;
+}
+
+
+/// Push down filter through specified type of step
+template <typename Step>
+static size_t simplePushDownOverStep(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes, QueryPlanStepPtr & child)
+{
+    if (typeid_cast<Step *>(child.get()))
+    {
+        Names allowed_inputs = child->getOutputStream().header.getNames();
+        if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs))
+            return updated_steps;
+    }
    return 0;
 }

@ -234,12 +250,8 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes
            return updated_steps;
    }

-    if (auto * distinct = typeid_cast<DistinctStep *>(child.get()))
-    {
-        Names allowed_inputs = distinct->getOutputStream().header.getNames();
-        if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs))
+    if (auto updated_steps = simplePushDownOverStep<DistinctStep>(parent_node, nodes, child))
        return updated_steps;
-    }

    if (auto * join = typeid_cast<JoinStep *>(child.get()))
    {
@ -290,7 +302,7 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes
            const size_t updated_steps = tryAddNewFilterStep(parent_node, nodes, split_filter, can_remove_filter, child_idx);
            if (updated_steps > 0)
            {
-                LOG_DEBUG(&Poco::Logger::get("QueryPlanOptimizations"), "Pushed down filter to {} side of join", kind);
+                LOG_DEBUG(&Poco::Logger::get("QueryPlanOptimizations"), "Pushed down filter {} to the {} side of join", split_filter_column_name, kind);
            }
            return updated_steps;
        };
@ -321,12 +333,11 @@ size_t tryPushDownFilter(QueryPlan::Node * parent_node, QueryPlan::Nodes & nodes
    // {
    // }

-    if (typeid_cast<SortingStep *>(child.get()))
-    {
-        Names allowed_inputs = child->getOutputStream().header.getNames();
-        if (auto updated_steps = tryAddNewFilterStep(parent_node, nodes, allowed_inputs))
+    if (auto updated_steps = simplePushDownOverStep<SortingStep>(parent_node, nodes, child))
+        return updated_steps;
+
+    if (auto updated_steps = simplePushDownOverStep<CreateSetAndFilterOnTheFlyStep>(parent_node, nodes, child))
        return updated_steps;
-    }

    if (auto * union_step = typeid_cast<UnionStep *>(child.get()))
    {
--- a/src/Processors/ResizeProcessor.h
+++ b/src/Processors/ResizeProcessor.h
@ -85,6 +85,13 @@ public:
    {
    }

+    StrictResizeProcessor(InputPorts inputs_, OutputPorts outputs_)
+        : IProcessor(inputs_, outputs_)
+        , current_input(inputs.begin())
+        , current_output(outputs.begin())
+    {
+    }
+
    String getName() const override { return "StrictResize"; }

    Status prepare(const PortNumbers &, const PortNumbers &) override;
--- a/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.cpp
+++ b/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.cpp
@ -0,0 +1,195 @@
+#include <Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.h>
+
+#include <cstddef>
+#include <mutex>
+
+#include <Interpreters/Set.h>
+#include <Common/Stopwatch.h>
+#include <Common/formatReadable.h>
+#include <Common/logger_useful.h>
+#include <Columns/IColumn.h>
+#include <Core/ColumnWithTypeAndName.h>
+#include <base/types.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+namespace
+{
+
+std::vector<size_t> getColumnIndices(const Block & block, const Names & column_names)
+{
+    std::vector<size_t> indices;
+    for (const auto & name : column_names)
+        indices.push_back(block.getPositionByName(name));
+    return indices;
+}
+
+Columns getColumnsByIndices(const Chunk & chunk, const std::vector<size_t> & indices)
+{
+    Columns columns;
+    const Columns & all_cols = chunk.getColumns();
+    for (const auto & index : indices)
+        columns.push_back(all_cols.at(index));
+    return columns;
+}
+
+ColumnsWithTypeAndName getColumnsByIndices(const Block & sample_block, const Chunk & chunk, const std::vector<size_t> & indices)
+{
+    Block block = sample_block.cloneEmpty();
+    block.setColumns(getColumnsByIndices(chunk, indices));
+    return block.getColumnsWithTypeAndName();
+}
+
+}
+
+CreatingSetsOnTheFlyTransform::CreatingSetsOnTheFlyTransform(
+    const Block & header_, const Names & column_names_, size_t num_streams_, SetWithStatePtr set_)
+    : ISimpleTransform(header_, header_, true)
+    , column_names(column_names_)
+    , key_column_indices(getColumnIndices(inputs.front().getHeader(), column_names))
+    , num_streams(num_streams_)
+    , set(set_)
+{
+}
+
+IProcessor::Status CreatingSetsOnTheFlyTransform::prepare()
+{
+    IProcessor::Status status = ISimpleTransform::prepare();
+
+    if (!set || status != Status::Finished)
+        /// Nothing to do with set
+        return status;
+
+    /// Finalize set
+    if (set->state == SetWithState::State::Creating)
+    {
+        if (input.isFinished())
+        {
+            set->finished_count++;
+            if (set->finished_count != num_streams)
+                /// Not all instances of processor are finished
+                return status;
+
+            set->finishInsert();
+            set->state = SetWithState::State::Finished;
+            LOG_DEBUG(log, "{}: finish building set for [{}] with {} rows, set size is {}",
+                getDescription(), fmt::join(column_names, ", "), set->getTotalRowCount(),
+                formatReadableSizeWithBinarySuffix(set->getTotalByteCount()));
+            set.reset();
+        }
+        else
+        {
+            /// Should not happen because processor inserted before join that reads all the data
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "Processor finished, but not all input was read");
+        }
+    }
+
+    return status;
+}
+
+void CreatingSetsOnTheFlyTransform::transform(Chunk & chunk)
+{
+    if (!set || set->state != SetWithState::State::Creating)
+    {
+        /// If set building suspended by another processor, release pointer
+        if (set != nullptr)
+            set.reset();
+        return;
+    }
+
+    if (chunk.getNumRows())
+    {
+        Columns key_columns = getColumnsByIndices(chunk, key_column_indices);
+        bool limit_exceeded = !set->insertFromBlock(key_columns);
+        if (limit_exceeded)
+        {
+            auto prev_state = set->state.exchange(SetWithState::State::Suspended);
+            /// Print log only after first state switch
+            if (prev_state == SetWithState::State::Creating)
+            {
+                LOG_DEBUG(log, "{}: set limit exceeded, give up building set, after reading {} rows and using {}",
+                    getDescription(), set->getTotalRowCount(), formatReadableSizeWithBinarySuffix(set->getTotalByteCount()));
+            }
+            /// Probaply we need to clear set here, because it's unneeded anymore
+            /// But now `Set` doesn't have such method, so reset pointer in all processors and then it should be freed
+            set.reset();
+        }
+    }
+}
+
+FilterBySetOnTheFlyTransform::FilterBySetOnTheFlyTransform(const Block & header_, const Names & column_names_, SetWithStatePtr set_)
+    : ISimpleTransform(header_, header_, true)
+    , column_names(column_names_)
+    , key_column_indices(getColumnIndices(inputs.front().getHeader(), column_names))
+    , set(set_)
+{
+    const auto & header = inputs.front().getHeader();
+    for (size_t idx : key_column_indices)
+        key_sample_block.insert(header.getByPosition(idx));
+}
+
+IProcessor::Status FilterBySetOnTheFlyTransform::prepare()
+{
+    auto status = ISimpleTransform::prepare();
+
+    if (set && set->state == SetWithState::State::Suspended)
+        set.reset();
+
+    if (status == Status::Finished)
+    {
+        bool has_filter = set && set->state == SetWithState::State::Finished;
+        if (has_filter)
+        {
+            LOG_DEBUG(log, "Finished {} by [{}]: consumed {} rows in total, {} rows bypassed, result {} rows, {:.2f}% filtered",
+                Poco::toLower(getDescription()), fmt::join(column_names, ", "),
+                stat.consumed_rows, stat.consumed_rows_before_set, stat.result_rows,
+                100 - 100.0 * stat.result_rows / stat.consumed_rows);
+        }
+        else
+        {
+            LOG_DEBUG(log, "Finished {}: bypass {} rows", Poco::toLower(getDescription()), stat.consumed_rows);
+        }
+
+        /// Release set to free memory
+        set = nullptr;
+    }
+    return status;
+}
+
+void FilterBySetOnTheFlyTransform::transform(Chunk & chunk)
+{
+    stat.consumed_rows += chunk.getNumRows();
+    stat.result_rows += chunk.getNumRows();
+
+    bool can_filter = set && set->state == SetWithState::State::Finished;
+    if (!can_filter)
+        stat.consumed_rows_before_set += chunk.getNumRows();
+
+    if (can_filter && chunk.getNumRows())
+    {
+        auto key_columns = getColumnsByIndices(key_sample_block, chunk, key_column_indices);
+        ColumnPtr mask_col = set->execute(key_columns, false);
+        const auto & mask = assert_cast<const ColumnUInt8 *>(mask_col.get())->getData();
+
+        stat.result_rows -= chunk.getNumRows();
+
+        Columns columns = chunk.detachColumns();
+        size_t result_num_rows = 0;
+        for (auto & col : columns)
+        {
+            col = col->filter(mask, /* negative */ false);
+            result_num_rows = col->size();
+        }
+        stat.result_rows += result_num_rows;
+
+        chunk.setColumns(std::move(columns), result_num_rows);
+    }
+}
+
+}
--- a/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.h
+++ b/src/Processors/Transforms/CreateSetAndFilterOnTheFlyTransform.h
@ -0,0 +1,114 @@
+#pragma once
+
+#include <atomic>
+#include <mutex>
+#include <vector>
+#include <Processors/ISimpleTransform.h>
+#include <Poco/Logger.h>
+#include <Interpreters/Set.h>
+
+namespace DB
+{
+
+struct SetWithState : public Set
+{
+    using Set::Set;
+
+    /// Flow: Creating -> Finished or Suspended
+    enum class State
+    {
+        /// Set is not yet created,
+        /// Creating processor continues to build set.
+        /// Filtering bypasses data.
+        Creating,
+
+        /// Set is finished.
+        /// Creating processor is finished.
+        /// Filtering filter stream with this set.
+        Finished,
+
+        /// Set building is canceled (due to limit exceeded).
+        /// Creating and filtering processors bypass data.
+        Suspended,
+    };
+
+    std::atomic<State> state = State::Creating;
+
+    /// Track number of processors that are currently working on this set.
+    /// Last one finalizes set.
+    std::atomic_size_t finished_count = 0;
+};
+
+using SetWithStatePtr = std::shared_ptr<SetWithState>;
+
+/*
+ * Create a set on the fly for incoming stream.
+ * The set is created from the key columns of the input block.
+ * Data is not changed and returned as is.
+ * Can be executed in parallel, but blocks on operations with set.
+ */
+class CreatingSetsOnTheFlyTransform : public ISimpleTransform
+{
+public:
+    CreatingSetsOnTheFlyTransform(const Block & header_, const Names & column_names_, size_t num_streams_, SetWithStatePtr set_);
+
+    String getName() const override { return "CreatingSetsOnTheFlyTransform"; }
+
+    Status prepare() override;
+
+    void transform(Chunk & chunk) override;
+
+private:
+    Names column_names;
+    std::vector<size_t> key_column_indices;
+
+    size_t num_streams;
+
+    /// Set to fill
+    SetWithStatePtr set;
+
+    Poco::Logger * log = &Poco::Logger::get("CreatingSetsOnTheFlyTransform");
+};
+
+/*
+ * Filter the input chunk by the set.
+ * When set building is not completed, just return the source data.
+ */
+class FilterBySetOnTheFlyTransform : public ISimpleTransform
+{
+public:
+    FilterBySetOnTheFlyTransform(const Block & header_, const Names & column_names_, SetWithStatePtr set_);
+
+    String getName() const override { return "FilterBySetOnTheFlyTransform"; }
+
+    Status prepare() override;
+
+    void transform(Chunk & chunk) override;
+
+private:
+    /// Set::execute requires ColumnsWithTypesAndNames, so we need to convert Chunk to that format
+    Block key_sample_block;
+
+    Names column_names;
+    std::vector<size_t> key_column_indices;
+
+    /// Filter by this set when it's created
+    SetWithStatePtr set;
+
+    /// Statistics to log
+    struct Stat
+    {
+        /// Total number of rows
+        size_t consumed_rows = 0;
+
+        /// Number of bypassed rows (processed before set is created)
+        size_t consumed_rows_before_set = 0;
+
+        /// Number of rows that passed the filter
+        size_t result_rows = 0;
+    } stat;
+
+    Poco::Logger * log = &Poco::Logger::get("FilterBySetOnTheFlyTransform");
+};
+
+}
--- a/src/QueryPipeline/Pipe.cpp
+++ b/src/QueryPipeline/Pipe.cpp
@ -770,7 +770,7 @@ void Pipe::setSinks(const Pipe::ProcessorGetterWithStreamKind & getter)
    header.clear();
 }

-void Pipe::transform(const Transformer & transformer)
+void Pipe::transform(const Transformer & transformer, bool check_ports)
 {
    if (output_ports.empty())
        throw Exception("Cannot transform empty Pipe", ErrorCodes::LOGICAL_ERROR);
@ -784,6 +784,9 @@ void Pipe::transform(const Transformer & transformer)

    for (const auto & port : output_ports)
    {
+        if (!check_ports)
+            break;
+
        if (!port->isConnected())
            throw Exception(
                ErrorCodes::LOGICAL_ERROR,
@ -799,6 +802,9 @@ void Pipe::transform(const Transformer & transformer)
    {
        for (const auto & port : processor->getInputs())
        {
+            if (!check_ports)
+                break;
+
            if (!port.isConnected())
                throw Exception(
                    ErrorCodes::LOGICAL_ERROR,
@ -806,7 +812,7 @@ void Pipe::transform(const Transformer & transformer)
                    processor->getName());

            const auto * connected_processor = &port.getOutputPort().getProcessor();
-            if (!set.contains(connected_processor))
+            if (check_ports && !set.contains(connected_processor))
                throw Exception(
                    ErrorCodes::LOGICAL_ERROR,
                    "Transformation of Pipe is not valid because processor {} has input port which is connected with unknown processor {}",
@ -823,7 +829,7 @@ void Pipe::transform(const Transformer & transformer)
            }

            const auto * connected_processor = &port.getInputPort().getProcessor();
-            if (!set.contains(connected_processor))
+            if (check_ports && !set.contains(connected_processor))
                throw Exception(
                    ErrorCodes::LOGICAL_ERROR,
                    "Transformation of Pipe is not valid because processor {} has output port which is connected with unknown processor {}",
--- a/src/QueryPipeline/Pipe.h
+++ b/src/QueryPipeline/Pipe.h
@ -85,13 +85,13 @@ public:
    /// Add chain to every output port.
    void addChains(std::vector<Chain> chains);

-    /// Changes the number of output ports if needed. Adds ResizeTransform.
+    /// Changes the number of output ports if needed. Adds (Strict)ResizeProcessor.
    void resize(size_t num_streams, bool force = false, bool strict = false);

    using Transformer = std::function<Processors(OutputPortRawPtrs ports)>;

    /// Transform Pipe in general way.
-    void transform(const Transformer & transformer);
+    void transform(const Transformer & transformer, bool check_ports = true);

    /// Unite several pipes together. They should have same header.
    static Pipe unitePipes(Pipes pipes);
--- a/src/QueryPipeline/QueryPipelineBuilder.cpp
+++ b/src/QueryPipeline/QueryPipelineBuilder.cpp
@ -159,10 +159,10 @@ void QueryPipelineBuilder::addChain(Chain chain)
    pipe.addChains(std::move(chains));
 }

-void QueryPipelineBuilder::transform(const Transformer & transformer)
+void QueryPipelineBuilder::transform(const Transformer & transformer, bool check_ports)
 {
    checkInitializedAndNotCompleted();
-    pipe.transform(transformer);
+    pipe.transform(transformer, check_ports);
 }

 void QueryPipelineBuilder::setSinks(const Pipe::ProcessorGetterWithStreamKind & getter)
@ -348,8 +348,7 @@ std::unique_ptr<QueryPipelineBuilder> QueryPipelineBuilder::joinPipelinesYShaped

    left->pipe.dropExtremes();
    right->pipe.dropExtremes();
-
-    if (left->pipe.output_ports.size() != 1 || right->pipe.output_ports.size() != 1)
+    if (left->getNumStreams() != 1 || right->getNumStreams() != 1)
        throw Exception("Join is supported only for pipelines with one output port", ErrorCodes::LOGICAL_ERROR);

    if (left->hasTotals() || right->hasTotals())
@ -359,8 +358,7 @@ std::unique_ptr<QueryPipelineBuilder> QueryPipelineBuilder::joinPipelinesYShaped

    auto joining = std::make_shared<MergeJoinTransform>(join, inputs, out_header, max_block_size);

-    auto result = mergePipelines(std::move(left), std::move(right), std::move(joining), collected_processors);
-    return result;
+    return mergePipelines(std::move(left), std::move(right), std::move(joining), collected_processors);
 }

 std::unique_ptr<QueryPipelineBuilder> QueryPipelineBuilder::joinPipelinesRightLeft(
--- a/src/QueryPipeline/QueryPipelineBuilder.h
+++ b/src/QueryPipeline/QueryPipelineBuilder.h
@ -69,7 +69,7 @@ public:

    using Transformer = std::function<Processors(OutputPortRawPtrs ports)>;
    /// Transform pipeline in general way.
-    void transform(const Transformer & transformer);
+    void transform(const Transformer & transformer, bool check_ports = true);

    /// Add TotalsHavingTransform. Resize pipeline to single input. Adds totals port.
    void addTotalsHavingTransform(ProcessorPtr transform);
--- a/tests/performance/join_set_filter.xml
+++ b/tests/performance/join_set_filter.xml
@ -0,0 +1,45 @@
+<test>
+    <substitutions>
+       <substitution>
+           <name>table_size</name>
+           <values>
+               <value>100000000</value>
+           </values>
+       </substitution>
+    </substitutions>
+
+    <settings>
+        <join_algorithm>full_sorting_merge</join_algorithm>
+    </settings>
+
+    <create_query>
+        CREATE TABLE t1 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y
+        AS SELECT
+            sipHash64(number, 't1_x') % {table_size} AS x,
+            sipHash64(number, 't1_y') % {table_size} AS y
+        FROM numbers({table_size})
+    </create_query>
+
+    <create_query>
+        CREATE TABLE t2 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y
+        AS SELECT
+            sipHash64(number, 't2_x') % {table_size} AS x,
+            sipHash64(number, 't2_y') % {table_size} AS y
+        FROM numbers({table_size})
+    </create_query>
+
+    <query>SELECT * FROM t1 JOIN t2 ON t1.x = t2.x WHERE less(t1.y, 10000)</query>
+    <query>SELECT * FROM t2 JOIN t1 ON t1.x = t2.x WHERE less(t1.y, 10000)</query>
+
+    <query>SELECT * FROM t1 JOIN t2 ON t1.x = t2.x WHERE greater(t1.y, {table_size} - 10000)</query>
+    <query>SELECT * FROM t2 JOIN t1 ON t1.x = t2.x WHERE greater(t1.y, {table_size} - 10000)</query>
+
+    <query>SELECT * FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.y % 100 = 0</query>
+    <query>SELECT * FROM t2 JOIN t1 ON t1.x = t2.x WHERE t1.y % 100 = 0</query>
+
+    <query>SELECT * FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.y % 1000 = 0</query>
+    <query>SELECT * FROM t2 JOIN t1 ON t1.x = t2.x WHERE t1.y % 1000 = 0</query>
+
+    <drop_query>DROP TABLE IF EXISTS t1</drop_query>
+    <drop_query>DROP TABLE IF EXISTS t2</drop_query>
+</test>
--- a/tests/queries/0_stateless/02381_join_dup_columns_in_plan.sql
+++ b/tests/queries/0_stateless/02381_join_dup_columns_in_plan.sql
@ -7,6 +7,8 @@ USING (key);

 SET join_algorithm = 'full_sorting_merge';

+SET max_rows_in_set_to_optimize_join = 0;
+
 EXPLAIN actions=0, description=0, header=1
 SELECT * FROM ( SELECT 'key2' AS key ) AS s1
 JOIN ( SELECT 'key1' AS key, '1' AS value UNION ALL SELECT 'key2' AS key, '1' AS value ) AS s2
--- a/tests/queries/0_stateless/02382_join_and_filtering_set.reference
+++ b/tests/queries/0_stateless/02382_join_and_filtering_set.reference
@ -0,0 +1,7 @@
+106
+46
+42
+51
+42
+24
+10
--- a/tests/queries/0_stateless/02382_join_and_filtering_set.sql
+++ b/tests/queries/0_stateless/02382_join_and_filtering_set.sql
@ -0,0 +1,20 @@
+DROP TABLE IF EXISTS t1;
+DROP TABLE IF EXISTS t2;
+
+CREATE TABLE t1 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y
+AS SELECT sipHash64(number, 't1_x') % 100 AS x, sipHash64(number, 't1_y') % 100 AS y FROM numbers(100);
+
+CREATE TABLE t2 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y
+AS SELECT sipHash64(number, 't2_x') % 100 AS x, sipHash64(number, 't2_y') % 100 AS y FROM numbers(100);
+
+SET max_rows_in_set_to_optimize_join = 1000;
+SET join_algorithm = 'full_sorting_merge';
+
+-- different combinations of conditions on key/attribute columns for the left/right tables
+SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x;
+SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.y % 2 == 0;
+SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.x % 2 == 0;
+SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t2.y % 2 == 0;
+SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t2.x % 2 == 0;
+SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.y % 2 == 0 AND t2.y % 2 == 0;
+SELECT count() FROM t1 JOIN t2 ON t1.x = t2.x WHERE t1.x % 2 == 0 AND t2.x % 2 == 0 AND t1.y % 2 == 0 AND t2.y % 2 == 0;
--- a/tests/queries/0_stateless/02383_join_and_filtering_set.reference
+++ b/tests/queries/0_stateless/02383_join_and_filtering_set.reference
@ -0,0 +1,10 @@
+Ok
+Ok
+Ok
+Ok
+Ok
+Ok
+Ok
+Ok
+Ok
+Ok
--- a/tests/queries/0_stateless/02383_join_and_filtering_set.sh
+++ b/tests/queries/0_stateless/02383_join_and_filtering_set.sh
@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+# Tags: no-asan,no-msan,no-tsan,no-ubsan
+#
+# Test doesn't run complex queries, just test the logic of setting, so no need to run with different builds.
+# Also, we run similar queries in 02382_join_and_filtering_set.sql which is enabled for these builds.
+#
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+$CLICKHOUSE_CLIENT -mn -q """
+CREATE TABLE t1 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y
+AS SELECT sipHash64(number, 't1_x') % 100 AS x, sipHash64(number, 't1_y') % 100 AS y FROM numbers(100);
+
+CREATE TABLE t2 (x UInt64, y UInt64) ENGINE = MergeTree ORDER BY y
+AS SELECT sipHash64(number, 't2_x') % 100 AS x, sipHash64(number, 't2_y') % 100 AS y FROM numbers(100);
+"""
+
+# Arguments:
+# - value of max_rows_in_set_to_optimize_join
+# - join kind
+# - expected number of steps in plan
+# - expected number of steps in pipeline
+function test() {
+
+PARAM_VALUE=$1
+JOIN_KIND=${2:-}
+
+EXPECTED_PLAN_STEPS=$3
+RES=$(
+    $CLICKHOUSE_CLIENT --max_rows_in_set_to_optimize_join=${PARAM_VALUE} --join_algorithm='full_sorting_merge' \
+                       -q "EXPLAIN PLAN SELECT count() FROM t1 ${JOIN_KIND} JOIN t2 ON t1.x = t2.x" | grep -o 'CreateSetAndFilterOnTheFlyStep' | wc -l
+)
+[ "$RES" -eq "$EXPECTED_PLAN_STEPS" ] && echo "Ok" || echo "Fail: $RES != $EXPECTED_PLAN_STEPS"
+
+EXPECTED_PIPELINE_STEPS=$4
+RES=$(
+    $CLICKHOUSE_CLIENT --max_rows_in_set_to_optimize_join=${PARAM_VALUE} --join_algorithm='full_sorting_merge' \
+                       -q "EXPLAIN PIPELINE SELECT count() FROM t1 ${JOIN_KIND} JOIN t2 ON t1.x = t2.x" \
+                       | grep -o -e ReadHeadBalancedProcessor -e FilterBySetOnTheFlyTransform -e CreatingSetsOnTheFlyTransform | wc -l
+)
+[ "$RES" -eq "$EXPECTED_PIPELINE_STEPS" ] && echo "Ok" || echo "Fail: $RES != $EXPECTED_PIPELINE_STEPS"
+
+}
+
+test 1000 '' 2 6
+
+# no filtering for left/right side
+test 1000 'LEFT' 2 5
+test 1000 'RIGHT' 2 5
+
+# when disabled no extra steps should be created
+test 1000 'FULL' 0 0
+test 0 '' 0 0