mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-12-04 21:42:39 +00:00
106 lines
3.8 KiB
C++
106 lines
3.8 KiB
C++
|
#pragma once
|
||
|
|
||
|
#include <condition_variable>
|
||
|
#include <memory>
|
||
|
#include <optional>
|
||
|
#include <Interpreters/HashJoin.h>
|
||
|
#include "Core/BackgroundSchedulePool.h"
|
||
|
#include "Functions/FunctionsLogical.h"
|
||
|
#include "Interpreters/ExpressionActions.h"
|
||
|
#include "Interpreters/IJoin.h"
|
||
|
#include "base/defines.h"
|
||
|
#include "base/types.h"
|
||
|
#include <Common/Stopwatch.h>
|
||
|
#include <Interpreters/Context.h>
|
||
|
namespace DB
|
||
|
{
|
||
|
namespace JoinStuff
|
||
|
{
|
||
|
/**
|
||
|
* Can run addJoinedBlock() in parallel to speed up the join process. In tests, the speedup is almost linear in
|
||
|
* the degree of parallelism.
|
||
|
*
|
||
|
* The default HashJoin is not thread-safe when inserting the right table's rows, so it runs in a single thread. When
|
||
|
* the right table is large, the join process is too slow.
|
||
|
*
|
||
|
* We create multiple HashJoin instances here. In addJoinedBlock(), one input block is split into multiple blocks
|
||
|
* corresponding to the HashJoin instances by hashing every row on the join keys. We also guarantee that every HashJoin
|
||
|
* instance is written by only one thread.
|
||
|
*
|
||
|
* When it comes to matching the left table, the blocks from the left table are also split across the HashJoin instances.
|
||
|
*
|
||
|
*/
|
||
|
class ConcurrentHashJoin : public IJoin
|
||
|
{
|
||
|
public:
|
||
|
explicit ConcurrentHashJoin(ContextPtr context_, std::shared_ptr<TableJoin> table_join_, size_t slots_, const Block & right_sample_block, bool any_take_last_row_ = false);
|
||
|
~ConcurrentHashJoin() override = default;
|
||
|
|
||
|
const TableJoin & getTableJoin() const override { return *table_join; }
|
||
|
bool addJoinedBlock(const Block & block, bool check_limits) override;
|
||
|
void checkTypesOfKeys(const Block & block) const override;
|
||
|
void joinBlock(Block & block, std::shared_ptr<ExtraBlock> & not_processed) override;
|
||
|
void setTotals(const Block & block) override;
|
||
|
const Block & getTotals() const override;
|
||
|
size_t getTotalRowCount() const override;
|
||
|
size_t getTotalByteCount() const override;
|
||
|
bool alwaysReturnsEmptySet() const override;
|
||
|
bool supportParallelJoin() const override { return true; }
|
||
|
std::shared_ptr<NotJoinedBlocks>
|
||
|
getNonJoinedBlocks(const Block & left_sample_block, const Block & result_sample_block, UInt64 max_block_size) const override;
|
||
|
private:
|
||
|
struct InnerHashJoin
|
||
|
{
|
||
|
std::mutex mutex;
|
||
|
std::unique_ptr<HashJoin> data;
|
||
|
std::list<Block> pending_right_blocks;
|
||
|
std::atomic<bool> in_inserting = false;
|
||
|
std::atomic<UInt32> rows = 0;
|
||
|
|
||
|
};
|
||
|
ContextPtr context;
|
||
|
std::shared_ptr<TableJoin> table_join;
|
||
|
size_t slots;
|
||
|
std::vector<std::shared_ptr<InnerHashJoin>> hash_joins;
|
||
|
std::atomic<size_t> check_total_rows;
|
||
|
std::atomic<size_t> check_total_bytes;
|
||
|
|
||
|
std::mutex finished_add_joined_blocks_tasks_mutex;
|
||
|
std::condition_variable finished_add_joined_blocks_tasks_cond;
|
||
|
std::atomic<UInt32> finished_add_joined_blocks_tasks = 0;
|
||
|
|
||
|
mutable std::mutex totals_mutex;
|
||
|
Block totals;
|
||
|
|
||
|
enum TableIndex
|
||
|
{
|
||
|
LEFT = 0,
|
||
|
RIGHT = 1
|
||
|
};
|
||
|
|
||
|
struct BlockDispatchControlData
|
||
|
{
|
||
|
std::mutex mutex;
|
||
|
std::atomic<bool> has_init = false;
|
||
|
std::shared_ptr<ExpressionActions> hash_expression_actions;
|
||
|
Strings hash_columns_names;
|
||
|
Block header;
|
||
|
BlockDispatchControlData() = default;
|
||
|
};
|
||
|
|
||
|
std::vector<std::shared_ptr<BlockDispatchControlData>> dispatch_datas;
|
||
|
|
||
|
Poco::Logger * logger = &Poco::Logger::get("ConcurrentHashJoin");
|
||
|
|
||
|
std::shared_ptr<ExpressionActions> buildHashExpressionAction(const Block & block, const Strings & based_columns_names, Strings & hash_columns_names);
|
||
|
BlockDispatchControlData & getBlockDispatchControlData(const Block & block, TableIndex table_index);
|
||
|
|
||
|
static void dispatchBlock(BlockDispatchControlData & dispatch_data, Block & from_block, std::vector<Block> & dispatched_blocks);
|
||
|
|
||
|
void waitAllAddJoinedBlocksFinished();
|
||
|
std::shared_ptr<InnerHashJoin> getUnfinishedAddJoinedBlockTaks();
|
||
|
|
||
|
};
|
||
|
}
|
||
|
}
|