ClickHouse/src/Storages/StorageJoin.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

671 lines
26 KiB
C++
Raw Normal View History

#include <Storages/StorageJoin.h>
#include <Storages/StorageFactory.h>
2021-04-18 09:38:50 +00:00
#include <Storages/StorageSet.h>
#include <Storages/TableLockHolder.h>
#include <Interpreters/HashJoin.h>
#include <Interpreters/Context.h>
#include <Parsers/ASTCreateQuery.h>
#include <Parsers/ASTIdentifier_fwd.h>
#include <Core/ColumnNumbers.h>
#include <DataTypes/NestedUtils.h>
#include <Interpreters/joinDispatch.h>
2021-04-18 09:38:50 +00:00
#include <Interpreters/MutationsInterpreter.h>
#include <Interpreters/TableJoin.h>
#include <Interpreters/castColumn.h>
2019-12-30 21:30:25 +00:00
#include <Common/quoteString.h>
2021-04-18 09:38:50 +00:00
#include <Common/Exception.h>
#include <Core/ColumnsWithTypeAndName.h>
2022-08-04 15:20:19 +00:00
#include <Interpreters/JoinUtils.h>
2021-04-18 09:38:50 +00:00
#include <Compression/CompressedWriteBuffer.h>
2022-05-20 19:49:31 +00:00
#include <Processors/ISource.h>
2021-10-16 14:03:50 +00:00
#include <QueryPipeline/Pipe.h>
2022-05-24 20:06:08 +00:00
#include <QueryPipeline/QueryPipelineBuilder.h>
2021-10-06 17:59:27 +00:00
#include <Processors/Executors/PullingPipelineExecutor.h>
#include <Poco/String.h>
#include <filesystem>
namespace fs = std::filesystem;
namespace DB
{
/// Error codes thrown from this file. They are only declared here (`extern`);
/// the definitions live elsewhere in the project.
namespace ErrorCodes
{
    extern const int BAD_ARGUMENTS;
    extern const int DEADLOCK_AVOIDED;
    extern const int INCOMPATIBLE_TYPE_OF_JOIN;
    extern const int LOGICAL_ERROR;
    extern const int NO_SUCH_COLUMN_IN_TABLE;
    extern const int NOT_IMPLEMENTED;
    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
    extern const int UNSUPPORTED_JOIN_KEYS;
}
/// Creates the storage: checks that every key names a physical column of the table,
/// builds the TableJoin description and an empty HashJoin, then calls restore().
/// Throws NO_SUCH_COLUMN_IN_TABLE if a key is not a physical column.
StorageJoin::StorageJoin(
    DiskPtr disk_,
    const String & relative_path_,
    const StorageID & table_id_,
    const Names & key_names_,
    bool use_nulls_,
    SizeLimits limits_,
    JoinKind kind_,
    JoinStrictness strictness_,
    const ColumnsDescription & columns_,
    const ConstraintsDescription & constraints_,
    const String & comment,
    bool overwrite_,
    bool persistent_)
    : StorageSetOrJoinBase{disk_, relative_path_, table_id_, columns_, constraints_, comment, persistent_}
    , key_names(key_names_)
    , use_nulls(use_nulls_)
    , limits(limits_)
    , kind(kind_)
    , strictness(strictness_)
    , overwrite(overwrite_)
{
    const auto storage_metadata = getInMemoryMetadataPtr();
    for (const auto & key_name : key_names)
    {
        const bool is_physical_column = storage_metadata->getColumns().hasPhysical(key_name);
        if (!is_physical_column)
            throw Exception(ErrorCodes::NO_SUCH_COLUMN_IN_TABLE, "Key column ({}) does not exist in table declaration.", key_name);
    }

    table_join = std::make_shared<TableJoin>(limits, use_nulls, kind, strictness, key_names);
    join = std::make_shared<HashJoin>(table_join, getRightSampleBlock(), overwrite);
    restore();
}
2021-09-30 08:47:15 +00:00
/// Acquires `lock` in mode `type` through tryLockTimed().
/// The query id and the timeout come from `context` when it is set; otherwise
/// RWLockImpl::NO_QUERY and DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC are used.
RWLockImpl::LockHolder StorageJoin::tryLockTimedWithContext(const RWLock & lock, RWLockImpl::Type type, ContextPtr context) const
{
    const String initial_query_id = context ? context->getInitialQueryId() : RWLockImpl::NO_QUERY;

    const std::chrono::milliseconds timeout
        = context ? context->getSettingsRef().lock_acquire_timeout : std::chrono::seconds(DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC);

    return tryLockTimed(lock, type, initial_query_id, timeout);
}
2023-03-13 10:49:51 +00:00
/// Same parameter derivation as tryLockTimedWithContext(), but calls lock->getLock() directly
/// with `false` as its last argument and returns whatever holder it yields.
/// insertBlock() treats an empty holder from this call as "the current query already reads this storage".
RWLockImpl::LockHolder StorageJoin::tryLockForCurrentQueryTimedWithContext(const RWLock & lock, RWLockImpl::Type type, ContextPtr context)
{
    const String initial_query_id = context ? context->getInitialQueryId() : RWLockImpl::NO_QUERY;

    const std::chrono::milliseconds timeout
        = context ? context->getSettingsRef().lock_acquire_timeout : std::chrono::seconds(DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC);

    return lock->getLock(type, initial_query_id, timeout, false);
}
/// Creates the insert sink through the base class, always passing async_insert = false.
SinkToStoragePtr StorageJoin::write(const ASTPtr & query, const StorageMetadataPtr & metadata_snapshot, ContextPtr context, bool /*async_insert*/)
{
    /// The mutex is held only while the sink is being created; it is released on return.
    std::lock_guard guard(mutate_mutex);
    auto sink = StorageSetOrJoinBase::write(query, metadata_snapshot, context, /*async_insert=*/false);
    return sink;
}
void StorageJoin::truncate(const ASTPtr &, const StorageMetadataPtr &, ContextPtr context, TableExclusiveLockHolder &)
2018-06-09 16:09:37 +00:00
{
2021-05-31 23:22:05 +00:00
std::lock_guard mutate_lock(mutate_mutex);
2021-09-30 08:47:15 +00:00
TableLockHolder holder = tryLockTimedWithContext(rwlock, RWLockImpl::Write, context);
2021-05-31 23:22:05 +00:00
if (disk->exists(path))
disk->removeRecursive(path);
else
2024-01-23 17:04:50 +00:00
LOG_INFO(getLogger("StorageJoin"), "Path {} is already removed from disk {}", path, disk->getName());
2020-12-15 16:45:13 +00:00
disk->createDirectories(path);
disk->createDirectories(fs::path(path) / "tmp/");
2018-06-09 16:09:37 +00:00
increment = 0;
join = std::make_shared<HashJoin>(table_join, getRightSampleBlock(), overwrite);
}
2018-06-09 16:09:37 +00:00
2021-04-18 09:38:50 +00:00
void StorageJoin::checkMutationIsPossible(const MutationCommands & commands, const Settings & /* settings */) const
{
2021-05-31 23:22:05 +00:00
for (const auto & command : commands)
if (command.type != MutationCommand::DELETE)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Table engine Join supports only DELETE mutations");
2021-04-18 09:38:50 +00:00
}
2023-01-30 17:38:28 +00:00
void StorageJoin::mutate(const MutationCommands & commands, ContextPtr context)
2021-04-18 09:38:50 +00:00
{
2021-06-28 17:02:22 +00:00
/// Firstly acquire lock for mutation, that locks changes of data.
/// We cannot acquire rwlock here, because read lock is needed
2021-05-31 23:22:05 +00:00
/// for execution of mutation interpreter.
std::lock_guard mutate_lock(mutate_mutex);
2021-04-18 09:38:50 +00:00
2021-05-31 23:22:05 +00:00
constexpr auto tmp_backup_file_name = "tmp/mut.bin";
auto metadata_snapshot = getInMemoryMetadataPtr();
2021-04-18 09:38:50 +00:00
2021-05-31 23:22:05 +00:00
auto backup_buf = disk->writeFile(path + tmp_backup_file_name);
2021-04-18 09:38:50 +00:00
auto compressed_backup_buf = CompressedWriteBuffer(*backup_buf);
2021-10-08 17:21:19 +00:00
auto backup_stream = NativeWriter(compressed_backup_buf, 0, metadata_snapshot->getSampleBlock());
2021-04-18 09:38:50 +00:00
auto new_data = std::make_shared<HashJoin>(table_join, getRightSampleBlock(), overwrite);
2021-05-31 23:22:05 +00:00
2022-05-09 19:13:02 +00:00
// New scope controls lifetime of pipeline.
2021-04-19 03:22:35 +00:00
{
2021-05-31 23:22:05 +00:00
auto storage_ptr = DatabaseCatalog::instance().getTable(getStorageID(), context);
2023-05-25 22:54:54 +00:00
MutationsInterpreter::Settings settings(true);
auto interpreter = std::make_unique<MutationsInterpreter>(storage_ptr, metadata_snapshot, commands, context, settings);
2022-05-24 20:06:08 +00:00
auto pipeline = QueryPipelineBuilder::getPipeline(interpreter->execute());
2021-10-06 17:59:27 +00:00
PullingPipelineExecutor executor(pipeline);
2021-05-31 23:22:05 +00:00
2021-10-06 17:59:27 +00:00
Block block;
while (executor.pull(block))
2021-05-31 23:22:05 +00:00
{
2023-07-05 17:03:18 +00:00
new_data->addBlockToJoin(block, true);
2021-05-31 23:22:05 +00:00
if (persistent)
backup_stream.write(block);
}
2021-04-18 09:38:50 +00:00
}
2021-06-28 17:02:22 +00:00
/// Now acquire exclusive lock and modify storage.
TableLockHolder holder = tryLockTimedWithContext(rwlock, RWLockImpl::Write, context);
2021-05-31 23:22:05 +00:00
2021-04-18 09:38:50 +00:00
join = std::move(new_data);
increment = 1;
2021-04-19 03:22:35 +00:00
if (persistent)
{
2021-04-18 09:38:50 +00:00
backup_stream.flush();
compressed_backup_buf.next();
backup_buf->next();
backup_buf->finalize();
std::vector<std::string> files;
disk->listFiles(path, files);
2021-04-19 03:22:35 +00:00
for (const auto & file_name: files)
{
if (file_name.ends_with(".bin"))
2021-04-18 09:38:50 +00:00
disk->removeFileIfExists(path + file_name);
}
2021-05-31 23:22:05 +00:00
disk->replaceFile(path + tmp_backup_file_name, path + std::to_string(increment) + ".bin");
2021-04-18 09:38:50 +00:00
}
}
2018-06-09 16:09:37 +00:00
/// Validates that `analyzed_join` can be served by this storage and returns a HashJoin that reuses the stored data.
///
/// Checks, in order: same strictness and kind; compatible use_nulls / forceNullableRight();
/// exactly one JOIN ON clause without filter conditions; the same number of keys as the storage;
/// every storage key present among the right keys of the clause.
/// On success the key lists of `analyzed_join` are rewritten (right keys := storage keys, left keys resorted
/// to match), a read lock on the storage is taken and attached to the returned join via setLock().
///
/// Throws INCOMPATIBLE_TYPE_OF_JOIN if any check fails.
HashJoinPtr StorageJoin::getJoinLocked(std::shared_ptr<TableJoin> analyzed_join, ContextPtr context, const Names & required_columns_names) const
{
    if (!analyzed_join->sameStrictnessAndKind(strictness, kind))
        throw Exception(ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN, "Table '{}' has incompatible type of JOIN", getStorageID().getNameForLogs());

    if ((analyzed_join->forceNullableRight() && !use_nulls) ||
        (!analyzed_join->forceNullableRight() && isLeftOrFull(analyzed_join->kind()) && use_nulls))
        throw Exception(
            ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN,
            "Table {} needs the same join_use_nulls setting as present in LEFT or FULL JOIN",
            getStorageID().getNameForLogs());

    if (analyzed_join->getClauses().size() != 1)
        throw Exception(ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN, "JOIN keys should match to the Join engine keys [{}]",
            fmt::join(getKeyNames(), ", "));

    const auto & join_on = analyzed_join->getOnlyClause();
    if (join_on.on_filter_condition_left || join_on.on_filter_condition_right)
        throw Exception(ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN, "ON section of JOIN with filter conditions is not implemented");

    const auto & key_names_right = join_on.key_names_right;
    const auto & key_names_left = join_on.key_names_left;
    if (key_names.size() != key_names_right.size() || key_names.size() != key_names_left.size())
        throw Exception(ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN,
            "Number of keys in JOIN ON section ({}) doesn't match number of keys in Join engine ({})",
            key_names_right.size(), key_names.size());

    /* Resort left keys according to right keys order in StorageJoin
     * We can't change the order of keys in StorageJoin
     * because the hash table was already built with tuples serialized in the order of key_names.
     * If we try to use the same hash table with different order of keys,
     * then calculated hashes and the result of the comparison will be wrong.
     *
     * Example:
     * ```
     * CREATE TABLE t_right (a UInt32, b UInt32) ENGINE = Join(ALL, INNER, a, b);
     * SELECT * FROM t_left JOIN t_right ON t_left.y = t_right.b AND t_left.x = t_right.a;
     * ```
     * In that case right keys should still be (a, b), need to change the order of the left keys to (x, y).
     */
    Names left_key_names_resorted;
    left_key_names_resorted.reserve(key_names.size());
    for (const auto & key_name : key_names)
    {
        const auto & renamed_key = analyzed_join->renamedRightColumnNameWithAlias(key_name);
        /// find position of renamed_key in key_names_right
        auto it = std::find(key_names_right.begin(), key_names_right.end(), renamed_key);
        if (it == key_names_right.end())
            throw Exception(ErrorCodes::INCOMPATIBLE_TYPE_OF_JOIN,
                "Key '{}' not found in JOIN ON section. Join engine key{} '{}' have to be used",
                key_name, key_names.size() > 1 ? "s" : "", fmt::join(key_names, ", "));
        const size_t key_position = std::distance(key_names_right.begin(), it);
        left_key_names_resorted.push_back(key_names_left[key_position]);
    }

    /// Set qualified identifiers to original names (table.column -> column).
    /// It's required because storage join stores non-qualified names.
    /// Qualifies will be added by join implementation (TableJoin contains a rename mapping).
    analyzed_join->setRightKeys(key_names);
    analyzed_join->setLeftKeys(left_key_names_resorted);

    /// The full right sample block does not depend on the requested column,
    /// so it is evaluated once instead of once per required column.
    const Block stored_right_sample = getRightSampleBlock();
    Block right_sample_block;
    for (const auto & name : required_columns_names)
        right_sample_block.insert(stored_right_sample.getByName(name));

    HashJoinPtr join_clone = std::make_shared<HashJoin>(analyzed_join, right_sample_block);

    /// The read lock travels with the returned join (setLock), so it is still referenced after this function returns.
    RWLockImpl::LockHolder holder = tryLockTimedWithContext(rwlock, RWLockImpl::Read, context);
    join_clone->setLock(holder);
    join_clone->reuseJoinedData(*join);

    return join_clone;
}
2021-09-30 08:47:15 +00:00
void StorageJoin::insertBlock(const Block & block, ContextPtr context)
{
Block block_to_insert = block;
convertRightBlock(block_to_insert);
TableLockHolder holder = tryLockForCurrentQueryTimedWithContext(rwlock, RWLockImpl::Write, context);
/// Protection from `INSERT INTO test_table_join SELECT * FROM test_table_join`
if (!holder)
throw Exception(ErrorCodes::DEADLOCK_AVOIDED, "StorageJoin: cannot insert data because current query tries to read from this storage");
2023-07-05 17:03:18 +00:00
join->addBlockToJoin(block_to_insert, true);
}
2021-09-30 08:47:15 +00:00
/// Returns the total row count of the stored join, read under a read lock obtained with `context`.
size_t StorageJoin::getSize(ContextPtr context) const
{
    TableLockHolder read_lock = tryLockTimedWithContext(rwlock, RWLockImpl::Read, context);
    const size_t row_count = join->getTotalRowCount();
    return row_count;
}
/// Returns the total row count of the stored join.
/// The read lock is taken with no query id and the timeout from `settings.lock_acquire_timeout`.
std::optional<UInt64> StorageJoin::totalRows(const Settings & settings) const
{
    TableLockHolder read_lock = tryLockTimed(rwlock, RWLockImpl::Read, RWLockImpl::NO_QUERY, settings.lock_acquire_timeout);
    return join->getTotalRowCount();
}
/// Returns the total byte count of the stored join.
/// The read lock is taken with no query id and the timeout from `settings.lock_acquire_timeout`.
std::optional<UInt64> StorageJoin::totalBytes(const Settings & settings) const
{
    TableLockHolder read_lock = tryLockTimed(rwlock, RWLockImpl::Read, RWLockImpl::NO_QUERY, settings.lock_acquire_timeout);
    return join->getTotalByteCount();
}
Squashed commit of the following: commit e712f469a55ff34ad34b482b15cc4153b7ad7233 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:59:13 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 2a002823084e3a79bffcc17d479620a68eb0644b Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:58:30 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 9e06f407c8ee781ed8ddf98bdfcc31846bf2a0fe Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:55:14 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 9581620f1e839f456fa7894aa1f996d5162ac6cd Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:54:22 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 2a8564c68cb6cc3649fafaf401256d43c9a2e777 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:47:34 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit cf60632d78ec656be3304ef4565e859bb6ce80ba Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:40:09 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit ee3d1dc6e0c4ca60e3ac1e0c30d4b3ed1e66eca0 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:22:49 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 65592ef7116a90104fcd524b53ef8b7cf22640f2 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:18:17 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 37972c257320d3b7e7b294e0fdeffff218647bfd Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:17:06 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit dd909d149974ce5bed2456de1261aa5a368fd3ff Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:16:28 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 3cf43266ca7e30adf01212b1a739ba5fe43639fd Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:15:42 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 6731a3df96d1609286e2536b6432916af7743f0f Author: Alexey Milovidov 
<milovidov@yandex-team.ru> Date: Sat Jan 14 11:13:35 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 1b5727e0d56415b7add4cb76110105358663602c Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:11:18 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit bbcf726a55685b8e72f5b40ba0bf1904bd1c0407 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:09:04 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit c03b477d5e2e65014e8906ecfa2efb67ee295af1 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:06:30 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 2986e2fb0466bc18d73693dcdded28fccc0dc66b Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:05:44 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit 5d6cdef13d2e02bd5c4954983334e9162ab2635b Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:04:53 2017 +0300 Less dependencies [#CLICKHOUSE-2] commit f2b819b25ce8b2ccdcb201eefb03e1e6f5aab590 Author: Alexey Milovidov <milovidov@yandex-team.ru> Date: Sat Jan 14 11:01:47 2017 +0300 Less dependencies [#CLICKHOUSE-2]
2017-01-14 09:00:19 +00:00
/// Forwards to HashJoin::joinGetCheckAndGetReturnType() of the stored join.
/// Unlike joinGet(), no storage lock is taken here.
DataTypePtr StorageJoin::joinGetCheckAndGetReturnType(const DataTypes & data_types, const String & column_name, bool or_null) const
{
    DataTypePtr return_type = join->joinGetCheckAndGetReturnType(data_types, column_name, or_null);
    return return_type;
}
2021-09-30 08:47:15 +00:00
/// Forwards to HashJoin::joinGet() of the stored join while holding a read lock obtained with `context`.
ColumnWithTypeAndName StorageJoin::joinGet(const Block & block, const Block & block_with_columns_to_add, ContextPtr context) const
{
    TableLockHolder read_lock = tryLockTimedWithContext(rwlock, RWLockImpl::Read, context);
    ColumnWithTypeAndName result_column = join->joinGet(block, block_with_columns_to_add);
    return result_column;
}
/// Converts every column of `block` to Nullable in place, but only when
/// `use_nulls` is set and isLeftOrFull(kind) holds; otherwise leaves the block untouched.
void StorageJoin::convertRightBlock(Block & block) const
{
    if (!use_nulls || !isLeftOrFull(kind))
        return;

    for (auto & column : block)
        JoinCommon::convertColumnToNullable(column);
}
/// Registers the "Join" table engine in `factory`.
///
/// Engine syntax: Join(strictness, kind, key1[, key2, ...]) where strictness is ANY|ALL|SEMI|ANTI and
/// kind is LEFT|INNER|RIGHT|FULL, both written as identifiers (case-insensitive).
/// Defaults for join_use_nulls, max_rows_in_join, max_bytes_in_join, join_overflow_mode,
/// join_any_take_last_row and any_join_distinct_right_table_keys are taken from the context settings
/// and may be overridden in the table's SETTINGS clause, which additionally accepts `disk` and `persistent`.
///
/// The creator throws BAD_ARGUMENTS for an unknown setting, strictness, kind or a non-identifier key,
/// and NUMBER_OF_ARGUMENTS_DOESNT_MATCH when fewer than three engine arguments are given.
void registerStorageJoin(StorageFactory & factory)
{
    auto creator_fn = [](const StorageFactory::Arguments & args)
    {
        /// Join(ANY, LEFT, k1, k2, ...)

        ASTs & engine_args = args.engine_args;

        const auto & settings = args.getContext()->getSettingsRef();

        /// Start from the context-level values; table-level SETTINGS below may override them.
        auto join_use_nulls = settings.join_use_nulls;
        auto max_rows_in_join = settings.max_rows_in_join;
        auto max_bytes_in_join = settings.max_bytes_in_join;
        auto join_overflow_mode = settings.join_overflow_mode;
        auto join_any_take_last_row = settings.join_any_take_last_row;
        auto old_any_join = settings.any_join_distinct_right_table_keys;
        bool persistent = true;
        String disk_name = "default";

        if (args.storage_def && args.storage_def->settings)
        {
            for (const auto & setting : args.storage_def->settings->changes)
            {
                if (setting.name == "join_use_nulls")
                    join_use_nulls = setting.value;
                else if (setting.name == "max_rows_in_join")
                    max_rows_in_join = setting.value;
                else if (setting.name == "max_bytes_in_join")
                    max_bytes_in_join = setting.value;
                else if (setting.name == "join_overflow_mode")
                    join_overflow_mode = setting.value;
                else if (setting.name == "join_any_take_last_row")
                    join_any_take_last_row = setting.value;
                else if (setting.name == "any_join_distinct_right_table_keys")
                    old_any_join = setting.value;
                else if (setting.name == "disk")
                    disk_name = setting.value.get<String>();
                else if (setting.name == "persistent")
                {
                    persistent = setting.value.get<bool>();
                }
                else
                    throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unknown setting {} for storage {}", setting.name, args.engine_name);
            }
        }

        DiskPtr disk = args.getContext()->getDisk(disk_name);

        /// FULL is listed here as well: it is accepted by the kind parser below.
        if (engine_args.size() < 3)
            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
                            "Storage Join requires at least 3 parameters: "
                            "Join(ANY|ALL|SEMI|ANTI, LEFT|INNER|RIGHT|FULL, keys...).");

        /// Unspecified / Comma serve as "not recognised" markers that are checked after parsing.
        JoinStrictness strictness = JoinStrictness::Unspecified;
        JoinKind kind = JoinKind::Comma;

        if (auto opt_strictness_id = tryGetIdentifierName(engine_args[0]))
        {
            const String strictness_str = Poco::toLower(*opt_strictness_id);

            if (strictness_str == "any")
            {
                /// With any_join_distinct_right_table_keys set, ANY is mapped to RightAny.
                if (old_any_join)
                    strictness = JoinStrictness::RightAny;
                else
                    strictness = JoinStrictness::Any;
            }
            else if (strictness_str == "all")
                strictness = JoinStrictness::All;
            else if (strictness_str == "semi")
                strictness = JoinStrictness::Semi;
            else if (strictness_str == "anti")
                strictness = JoinStrictness::Anti;
        }

        if (strictness == JoinStrictness::Unspecified)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "First parameter of storage Join must be ANY or ALL or SEMI or ANTI (without quotes).");

        if (auto opt_kind_id = tryGetIdentifierName(engine_args[1]))
        {
            const String kind_str = Poco::toLower(*opt_kind_id);

            if (kind_str == "left")
                kind = JoinKind::Left;
            else if (kind_str == "inner")
                kind = JoinKind::Inner;
            else if (kind_str == "right")
                kind = JoinKind::Right;
            else if (kind_str == "full")
            {
                /// ANY FULL is always stored as RightAny.
                if (strictness == JoinStrictness::Any)
                    strictness = JoinStrictness::RightAny;
                kind = JoinKind::Full;
            }
        }

        if (kind == JoinKind::Comma)
            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second parameter of storage Join must be LEFT or INNER or RIGHT or FULL (without quotes).");

        /// Everything after the first two arguments is a key column name.
        Names key_names;
        key_names.reserve(engine_args.size() - 2);
        for (size_t i = 2, size = engine_args.size(); i < size; ++i)
        {
            auto opt_key = tryGetIdentifierName(engine_args[i]);
            if (!opt_key)
                throw Exception(ErrorCodes::BAD_ARGUMENTS, "Parameter №{} of storage Join don't look like column name.", i + 1);

            key_names.push_back(*opt_key);
        }

        return std::make_shared<StorageJoin>(
            disk,
            args.relative_data_path,
            args.table_id,
            key_names,
            join_use_nulls,
            SizeLimits{max_rows_in_join, max_bytes_in_join, join_overflow_mode},
            kind,
            strictness,
            args.columns,
            args.constraints,
            args.comment,
            join_any_take_last_row,
            persistent);
    };

    factory.registerStorage("Join", creator_fn, StorageFactory::StorageFeatures{ .supports_settings = true, });
}
/// Returns the address of `t` viewed as a pointer to its first byte.
template <typename T>
static const char * rawData(T & t)
{
    const auto * first_byte = reinterpret_cast<const char *>(&t);
    return first_byte;
}
/// Returns the size in bytes of the argument's static type; the value itself is not inspected.
template <typename T>
static size_t rawSize(T & /*value*/)
{
    constexpr size_t object_size = sizeof(T);
    return object_size;
}
template <>
const char * rawData(const StringRef & t)
{
return t.data;
}
template <>
size_t rawSize(const StringRef & t)
{
return t.size;
}
2022-05-20 19:49:31 +00:00
class JoinSource : public ISource
{
public:
JoinSource(HashJoinPtr join_, TableLockHolder lock_holder_, UInt64 max_block_size_, Block sample_block_)
2022-05-20 19:49:31 +00:00
: ISource(sample_block_)
, join(join_)
, lock_holder(lock_holder_)
, max_block_size(max_block_size_)
, sample_block(std::move(sample_block_))
{
2021-09-10 14:52:44 +00:00
if (!join->getTableJoin().oneDisjunct())
throw DB::Exception(ErrorCodes::NOT_IMPLEMENTED, "StorageJoin does not support OR for keys in JOIN ON section");
column_indices.resize(sample_block.columns());
2020-06-01 17:27:22 +00:00
auto & saved_block = join->getJoinedData()->sample_block;
2020-06-01 17:27:22 +00:00
for (size_t i = 0; i < sample_block.columns(); ++i)
{
auto & [_, type, name] = sample_block.getByPosition(i);
if (join->right_table_keys.has(name))
{
key_pos = i;
const auto & column = join->right_table_keys.getByName(name);
2020-06-01 17:27:22 +00:00
restored_block.insert(column);
}
else
{
2020-06-01 17:27:22 +00:00
size_t pos = saved_block.getPositionByName(name);
column_indices[i] = pos;
2020-06-01 17:27:22 +00:00
2020-06-01 19:38:51 +00:00
const auto & column = saved_block.getByPosition(pos);
2020-06-01 17:27:22 +00:00
restored_block.insert(column);
}
}
}
String getName() const override { return "Join"; }
protected:
Chunk generate() override
{
if (join->data->blocks.empty())
return {};
Chunk chunk;
2021-06-25 12:03:10 +00:00
if (!joinDispatch(join->kind, join->strictness, join->data->maps.front(),
[&](auto kind, auto strictness, auto & map) { chunk = createChunk<kind, strictness>(map); }))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown JOIN strictness");
return chunk;
}
private:
HashJoinPtr join;
TableLockHolder lock_holder;
2019-02-10 16:55:12 +00:00
UInt64 max_block_size;
Block sample_block;
2020-06-01 17:27:22 +00:00
Block restored_block; /// sample_block with parent column types
ColumnNumbers column_indices;
std::optional<size_t> key_pos;
std::unique_ptr<void, std::function<void(void *)>> position; /// type erasure
2022-07-29 16:30:50 +00:00
template <JoinKind KIND, JoinStrictness STRICTNESS, typename Maps>
Chunk createChunk(const Maps & maps)
{
MutableColumns mut_columns = restored_block.cloneEmpty().mutateColumns();
size_t rows_added = 0;
switch (join->data->type)
{
#define M(TYPE) \
case HashJoin::Type::TYPE: \
rows_added = fillColumns<KIND, STRICTNESS>(*maps.TYPE, mut_columns); \
break;
APPLY_FOR_JOIN_VARIANTS_LIMITED(M)
#undef M
default:
2023-12-11 15:50:27 +00:00
throw Exception(ErrorCodes::UNSUPPORTED_JOIN_KEYS, "Unsupported JOIN keys of type {} in StorageJoin", join->data->type);
}
if (!rows_added)
return {};
Columns columns;
columns.reserve(mut_columns.size());
for (auto & col : mut_columns)
columns.emplace_back(std::move(col));
/// Correct nullability and LowCardinality types
for (size_t i = 0; i < columns.size(); ++i)
2020-06-01 17:27:22 +00:00
{
const auto & src = restored_block.getByPosition(i);
const auto & dst = sample_block.getByPosition(i);
2020-06-01 17:27:22 +00:00
if (!src.type->equals(*dst.type))
{
auto arg = src;
arg.column = std::move(columns[i]);
columns[i] = castColumn(arg, dst.type);
}
2020-06-01 17:27:22 +00:00
}
2020-06-01 17:27:22 +00:00
UInt64 num_rows = columns.at(0)->size();
return Chunk(std::move(columns), num_rows);
}
2022-07-29 16:30:50 +00:00
template <JoinKind KIND, JoinStrictness STRICTNESS, typename Map>
2020-06-01 17:27:22 +00:00
size_t fillColumns(const Map & map, MutableColumns & columns)
{
size_t rows_added = 0;
if (!position)
position = decltype(position)(
2023-02-19 22:15:09 +00:00
static_cast<void *>(new typename Map::const_iterator(map.begin())),
[](void * ptr) { delete reinterpret_cast<typename Map::const_iterator *>(ptr); });
auto & it = *reinterpret_cast<typename Map::const_iterator *>(position.get());
auto end = map.end();
for (; it != end; ++it)
{
2022-07-29 16:30:50 +00:00
if constexpr (STRICTNESS == JoinStrictness::RightAny)
{
fillOne<Map>(columns, column_indices, it, key_pos, rows_added);
}
2022-07-29 16:30:50 +00:00
else if constexpr (STRICTNESS == JoinStrictness::All)
{
fillAll<Map>(columns, column_indices, it, key_pos, rows_added);
}
2022-07-29 16:30:50 +00:00
else if constexpr (STRICTNESS == JoinStrictness::Any)
{
2022-07-29 16:30:50 +00:00
if constexpr (KIND == JoinKind::Left || KIND == JoinKind::Inner)
fillOne<Map>(columns, column_indices, it, key_pos, rows_added);
2022-07-29 16:30:50 +00:00
else if constexpr (KIND == JoinKind::Right)
fillAll<Map>(columns, column_indices, it, key_pos, rows_added);
}
2022-07-29 16:30:50 +00:00
else if constexpr (STRICTNESS == JoinStrictness::Semi)
{
2022-07-29 16:30:50 +00:00
if constexpr (KIND == JoinKind::Left)
fillOne<Map>(columns, column_indices, it, key_pos, rows_added);
2022-07-29 16:30:50 +00:00
else if constexpr (KIND == JoinKind::Right)
fillAll<Map>(columns, column_indices, it, key_pos, rows_added);
}
2022-07-29 16:30:50 +00:00
else if constexpr (STRICTNESS == JoinStrictness::Anti)
{
2022-07-29 16:30:50 +00:00
if constexpr (KIND == JoinKind::Left)
fillOne<Map>(columns, column_indices, it, key_pos, rows_added);
2022-07-29 16:30:50 +00:00
else if constexpr (KIND == JoinKind::Right)
fillAll<Map>(columns, column_indices, it, key_pos, rows_added);
}
else
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "This JOIN is not implemented yet");
if (rows_added >= max_block_size)
{
++it;
break;
}
}
return rows_added;
}
template <typename Map>
static void fillOne(MutableColumns & columns, const ColumnNumbers & column_indices, typename Map::const_iterator & it,
const std::optional<size_t> & key_pos, size_t & rows_added)
{
for (size_t j = 0; j < columns.size(); ++j)
if (j == key_pos)
columns[j]->insertData(rawData(it->getKey()), rawSize(it->getKey()));
else
columns[j]->insertFrom(*it->getMapped().block->getByPosition(column_indices[j]).column.get(), it->getMapped().row_num);
++rows_added;
}
template <typename Map>
static void fillAll(MutableColumns & columns, const ColumnNumbers & column_indices, typename Map::const_iterator & it,
const std::optional<size_t> & key_pos, size_t & rows_added)
{
for (auto ref_it = it->getMapped().begin(); ref_it.ok(); ++ref_it)
{
for (size_t j = 0; j < columns.size(); ++j)
if (j == key_pos)
columns[j]->insertData(rawData(it->getKey()), rawSize(it->getKey()));
else
columns[j]->insertFrom(*ref_it->block->getByPosition(column_indices[j]).column.get(), ref_it->row_num);
++rows_added;
}
}
};
// TODO: multiple stream read and index read
2020-08-03 13:54:14 +00:00
/// Returns a pipe with a single JoinSource that reads the requested columns from the stored join.
/// The read lock taken here is handed over to the source.
Pipe StorageJoin::read(
    const Names & column_names,
    const StorageSnapshotPtr & storage_snapshot,
    SelectQueryInfo & /*query_info*/,
    ContextPtr context,
    QueryProcessingStage::Enum /*processed_stage*/,
    size_t max_block_size,
    size_t /*num_streams*/)
{
    storage_snapshot->check(column_names);

    Block header = storage_snapshot->getSampleBlockForColumns(column_names);
    RWLockImpl::LockHolder read_lock = tryLockTimedWithContext(rwlock, RWLockImpl::Read, context);

    auto source = std::make_shared<JoinSource>(join, std::move(read_lock), max_block_size, header);
    return Pipe(std::move(source));
}
}