2022-04-19 08:07:30 +00:00
|
|
|
#pragma once
|
|
|
|
|
|
|
|
#include <condition_variable>
|
|
|
|
#include <memory>
|
|
|
|
#include <optional>
|
2022-04-19 08:53:24 +00:00
|
|
|
#include <Core/BackgroundSchedulePool.h>
|
|
|
|
#include <Functions/FunctionsLogical.h>
|
|
|
|
#include <Interpreters/Context.h>
|
|
|
|
#include <Interpreters/ExpressionActions.h>
|
2022-04-19 08:07:30 +00:00
|
|
|
#include <Interpreters/HashJoin.h>
|
2022-04-19 08:53:24 +00:00
|
|
|
#include <Interpreters/IJoin.h>
|
|
|
|
#include <base/defines.h>
|
|
|
|
#include <base/types.h>
|
2022-04-19 08:07:30 +00:00
|
|
|
#include <Common/Stopwatch.h>
|
2022-04-20 11:47:16 +00:00
|
|
|
|
2022-04-19 08:07:30 +00:00
|
|
|
namespace DB
|
|
|
|
{
|
2022-05-06 15:17:46 +00:00
|
|
|
|
2022-04-19 08:07:30 +00:00
|
|
|
/**
|
|
|
|
* Can run addJoinedBlock() in parallel to speed up the join process. In tests, the speedup is almost linear in
|
|
|
|
* the degree of parallelism.
|
|
|
|
*
|
|
|
|
* The default HashJoin is not thread safe for inserting the right table's rows, so it runs in a single thread. When
|
|
|
|
* the right table is large, the join process is too slow.
|
|
|
|
*
|
|
|
|
* We create multiple HashJoin instances here. In addJoinedBlock(), one input block is split into multiple blocks
|
|
|
|
* corresponding to the HashJoin instances by hashing every row on the join keys. We also guarantee that every HashJoin
|
|
|
|
* instance is written by only one thread.
|
|
|
|
*
|
|
|
|
* When it comes to matching the left table, the blocks from the left table are also split across the different HashJoin instances.
|
|
|
|
*
|
|
|
|
*/
|
|
|
|
class ConcurrentHashJoin : public IJoin
|
|
|
|
{
|
2022-05-06 15:17:46 +00:00
|
|
|
|
2022-04-19 08:07:30 +00:00
|
|
|
public:
|
2022-05-05 04:04:11 +00:00
|
|
|
explicit ConcurrentHashJoin(ContextPtr context_, std::shared_ptr<TableJoin> table_join_, size_t slots_, const Block & right_sample_block, bool any_take_last_row_ = false);
|
2022-04-19 08:07:30 +00:00
|
|
|
~ConcurrentHashJoin() override = default;
|
|
|
|
|
|
|
|
const TableJoin & getTableJoin() const override { return *table_join; }
|
|
|
|
bool addJoinedBlock(const Block & block, bool check_limits) override;
|
|
|
|
void checkTypesOfKeys(const Block & block) const override;
|
|
|
|
void joinBlock(Block & block, std::shared_ptr<ExtraBlock> & not_processed) override;
|
|
|
|
void setTotals(const Block & block) override;
|
|
|
|
const Block & getTotals() const override;
|
|
|
|
size_t getTotalRowCount() const override;
|
|
|
|
size_t getTotalByteCount() const override;
|
|
|
|
bool alwaysReturnsEmptySet() const override;
|
|
|
|
bool supportParallelJoin() const override { return true; }
|
2022-10-04 08:21:02 +00:00
|
|
|
std::unique_ptr<IBlocksStream>
|
2022-04-19 08:07:30 +00:00
|
|
|
getNonJoinedBlocks(const Block & left_sample_block, const Block & result_sample_block, UInt64 max_block_size) const override;
|
2022-05-06 15:17:46 +00:00
|
|
|
|
2022-04-19 08:07:30 +00:00
|
|
|
private:
|
2022-04-21 04:14:36 +00:00
|
|
|
struct InternalHashJoin
|
2022-04-19 08:07:30 +00:00
|
|
|
{
|
|
|
|
std::mutex mutex;
|
|
|
|
std::unique_ptr<HashJoin> data;
|
|
|
|
};
|
2022-04-21 05:19:33 +00:00
|
|
|
|
2022-04-19 08:07:30 +00:00
|
|
|
ContextPtr context;
|
|
|
|
std::shared_ptr<TableJoin> table_join;
|
|
|
|
size_t slots;
|
2022-04-21 04:14:36 +00:00
|
|
|
std::vector<std::shared_ptr<InternalHashJoin>> hash_joins;
|
2022-04-19 08:07:30 +00:00
|
|
|
|
2022-06-14 14:13:01 +00:00
|
|
|
std::mutex totals_mutex;
|
2022-04-19 08:07:30 +00:00
|
|
|
Block totals;
|
|
|
|
|
2022-06-14 14:13:01 +00:00
|
|
|
IColumn::Selector selectDispatchBlock(const Strings & key_columns_names, const Block & from_block);
|
2022-05-05 04:04:11 +00:00
|
|
|
Blocks dispatchBlock(const Strings & key_columns_names, const Block & from_block);
|
2022-04-19 08:07:30 +00:00
|
|
|
|
|
|
|
};
|
2022-05-06 15:17:46 +00:00
|
|
|
|
2022-04-19 08:07:30 +00:00
|
|
|
}
|