From dd3ea8d31b1b5615bcdd6b671f5e6e86315e8dda Mon Sep 17 00:00:00 2001
From: liyang830
Date: Fri, 10 Mar 2023 20:52:27 +0800
Subject: [PATCH 0001/1081] feat: modify materialized view query, check inner table structure

---
 src/Storages/StorageMaterializedView.cpp | 13 +++++++++++++
 ...erialized_view_query_has_inner_table.reference | 2 ++
 ...er_materialized_view_query_has_inner_table.sql | 15 +++++++++++++++
 3 files changed, 30 insertions(+)
 create mode 100644 tests/queries/0_stateless/25340_alter_materialized_view_query_has_inner_table.reference
 create mode 100644 tests/queries/0_stateless/25340_alter_materialized_view_query_has_inner_table.sql

diff --git a/src/Storages/StorageMaterializedView.cpp b/src/Storages/StorageMaterializedView.cpp
index ae3fa62b38c..0bbd689043c 100644
--- a/src/Storages/StorageMaterializedView.cpp
+++ b/src/Storages/StorageMaterializedView.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -269,6 +270,18 @@ void StorageMaterializedView::alter(
     DatabaseCatalog::instance().updateViewDependency(old_select.select_table_id, table_id, new_select.select_table_id, table_id);

     new_metadata.setSelectQuery(new_select);
+
+    /// check materialized view inner table structure
+    if (has_inner_table)
+    {
+        const Block & block = InterpreterSelectWithUnionQuery::getSampleBlock(new_select.select_query, local_context);
+        for (const auto & col : block.getColumnsWithTypeAndName())
+        {
+            if (!tryGetTargetTable()->getInMemoryMetadata().columns.has(col.name))
+                throw Exception(ErrorCodes::QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW, "column {} is not in materialized view inner table", col.name);
+        }
+    }
+
 }

 /// end modify query

diff --git a/tests/queries/0_stateless/25340_alter_materialized_view_query_has_inner_table.reference b/tests/queries/0_stateless/25340_alter_materialized_view_query_has_inner_table.reference
new file mode 100644
index 00000000000..1191247b6d9
--- /dev/null
+++ b/tests/queries/0_stateless/25340_alter_materialized_view_query_has_inner_table.reference
@@ -0,0 +1,2 @@
+1
+2

diff --git a/tests/queries/0_stateless/25340_alter_materialized_view_query_has_inner_table.sql b/tests/queries/0_stateless/25340_alter_materialized_view_query_has_inner_table.sql
new file mode 100644
index 00000000000..73bbac59a95
--- /dev/null
+++ b/tests/queries/0_stateless/25340_alter_materialized_view_query_has_inner_table.sql
@@ -0,0 +1,15 @@
+DROP TABLE IF EXISTS src_table;
+DROP TABLE IF EXISTS mv;
+
+CREATE TABLE src_table (`a` UInt32, `b` UInt32) ENGINE = MergeTree ORDER BY a;
+CREATE MATERIALIZED VIEW mv UUID '2bad6d75-86fe-4da0-815b-2c7410253941' (`a` UInt32) ENGINE = MergeTree ORDER BY a AS SELECT a FROM src_table;
+
+INSERT INTO src_table (a, b) VALUES (1, 1), (2, 2);
+
+SELECT * FROM mv;
+
+SET allow_experimental_alter_materialized_view_structure = 1;
+ALTER TABLE mv MODIFY QUERY SELECT a, b FROM src_table; -- {serverError QUERY_IS_NOT_SUPPORTED_IN_MATERIALIZED_VIEW}
+
+DROP TABLE src_table;
+DROP TABLE mv;
\ No newline at end of file

From 3f5853b970dd205465a5593d5786c1f8a4d82cc7 Mon Sep 17 00:00:00 2001
From: AN
Date: Fri, 27 Oct 2023 19:17:13 +0300
Subject: [PATCH 0002/1081] Update index.md
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Punctuation fixes, узел→сервер as suggested by alexei-milovidov at
https://github.com/ClickHouse/ClickHouse/pull/56040#issuecomment-1783155867,
консистентность → согласованность (standard translation instead of calque)
---
 docs/ru/index.md | 18 +++++++++---------
 1 file
changed, 9 insertions(+), 9 deletions(-) diff --git a/docs/ru/index.md b/docs/ru/index.md index 78bb382753b..a9a666b18db 100644 --- a/docs/ru/index.md +++ b/docs/ru/index.md @@ -41,7 +41,7 @@ ClickHouse — столбцовая система управления база Разный порядок хранения данных лучше подходит для разных сценариев работы. Сценарий работы с данными — это то, какие производятся запросы, как часто и в каком соотношении; сколько читается данных на запросы каждого вида — строк, столбцов, байтов; как соотносятся чтения и обновления данных; какой рабочий размер данных и насколько локально он используется; используются ли транзакции и с какой изолированностью; какие требования к дублированию данных и логической целостности; требования к задержкам на выполнение и пропускной способности запросов каждого вида и т. п. -Чем больше нагрузка на систему, тем более важной становится специализация под сценарий работы, и тем более конкретной становится эта специализация. Не существует системы, одинаково хорошо подходящей под существенно различные сценарии работы. Если система подходит под широкое множество сценариев работы, то при достаточно большой нагрузке, система будет справляться со всеми сценариями работы плохо, или справляться хорошо только с одним из сценариев работы. +Чем больше нагрузка на систему, тем более важной становится специализация под сценарий работы, и тем более конкретной становится эта специализация. Не существует системы, одинаково хорошо подходящей под существенно различные сценарии работы. Если система подходит под широкое множество сценариев работы, то при достаточно большой нагрузке система будет справляться со всеми сценариями работы плохо, или справляться хорошо только с одним из сценариев работы. ## Ключевые особенности OLAP-сценария работы {#kliuchevye-osobennosti-olap-stsenariia-raboty} @@ -53,11 +53,11 @@ ClickHouse — столбцовая система управления база - запросы идут сравнительно редко (обычно не более сотни в секунду на сервер); - при выполнении простых запросов, допустимы задержки в районе 50 мс; - значения в столбцах достаточно мелкие — числа и небольшие строки (например, 60 байт на URL); -- требуется высокая пропускная способность при обработке одного запроса (до миллиардов строк в секунду на один узел); +- требуется высокая пропускная способность при обработке одного запроса (до миллиардов строк в секунду на один сервер); - транзакции отсутствуют; -- низкие требования к консистентности данных; -- в запросе одна большая таблица, все таблицы кроме одной маленькие; -- результат выполнения запроса существенно меньше исходных данных — то есть данные фильтруются или агрегируются; результат выполнения помещается в оперативную память одного узла. +- низкие требования к согласованности данных; +- в запросе одна большая таблица, все остальные таблицы из запроса — маленькие; +- результат выполнения запроса существенно меньше исходных данных — то есть данные фильтруются или агрегируются; результат выполнения помещается в оперативную память одного сервера. Легко видеть, что OLAP-сценарий работы существенно отличается от других распространённых сценариев работы (например, OLTP или Key-Value сценариев работы). Таким образом, не имеет никакого смысла пытаться использовать OLTP-системы или системы класса «ключ — значение» для обработки аналитических запросов, если вы хотите получить приличную производительность («выше плинтуса»). Например, если вы попытаетесь использовать для аналитики MongoDB или Redis — вы получите анекдотически низкую производительность по сравнению с OLAP-СУБД. 
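The OLAP profile described in the hunk above — wide tables read a few columns at a time, with results aggregated down to a set that fits in one server's memory — corresponds to queries of roughly the following shape. A minimal sketch in ClickHouse SQL; the `hits` table and its columns are hypothetical, used only for illustration and not part of this patch:

``` sql
-- Reads 2 columns out of a potentially very wide table, then filters
-- and aggregates a large number of rows down to a 10-row result.
SELECT
    CounterID,
    count() AS views
FROM hits
WHERE EventDate >= '2023-01-01'
GROUP BY CounterID
ORDER BY views DESC
LIMIT 10;
```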
@@ -77,11 +77,11 @@ ClickHouse — столбцовая система управления база ### По вводу-выводу {#po-vvodu-vyvodu} -1. Для выполнения аналитического запроса, требуется прочитать небольшое количество столбцов таблицы. В столбцовой БД для этого можно читать только нужные данные. Например, если вам требуется только 5 столбцов из 100, то следует рассчитывать на 20-кратное уменьшение ввода-вывода. -2. Так как данные читаются пачками, то их проще сжимать. Данные, лежащие по столбцам также лучше сжимаются. За счёт этого, дополнительно уменьшается объём ввода-вывода. -3. За счёт уменьшения ввода-вывода, больше данных влезает в системный кэш. +1. Для выполнения аналитического запроса требуется прочитать небольшое количество столбцов таблицы. В столбцовой БД для этого можно читать только нужные данные. Например, если вам требуется только 5 столбцов из 100, то следует рассчитывать на 20-кратное уменьшение ввода-вывода. +2. Так как данные читаются пачками, то их проще сжимать. Данные, лежащие по столбцам, также лучше сжимаются. За счёт этого, дополнительно уменьшается объём ввода-вывода. +3. За счёт уменьшения ввода-вывода больше данных влезает в системный кэш. -Например, для запроса «посчитать количество записей для каждой рекламной системы», требуется прочитать один столбец «идентификатор рекламной системы», который занимает 1 байт в несжатом виде. Если большинство переходов было не с рекламных систем, то можно рассчитывать хотя бы на десятикратное сжатие этого столбца. При использовании быстрого алгоритма сжатия, возможно разжатие данных со скоростью более нескольких гигабайт несжатых данных в секунду. То есть, такой запрос может выполняться со скоростью около нескольких миллиардов строк в секунду на одном сервере. На практике, такая скорость действительно достигается. +Например, для запроса «посчитать количество записей для каждой рекламной системы» требуется прочитать один столбец «идентификатор рекламной системы», который занимает 1 байт в несжатом виде. Если большинство переходов было не с рекламных систем, то можно рассчитывать хотя бы на десятикратное сжатие этого столбца. При использовании быстрого алгоритма сжатия возможно разжатие данных со скоростью более нескольких гигабайт несжатых данных в секунду. То есть такой запрос может выполняться со скоростью около нескольких миллиардов строк в секунду на одном сервере. На практике такая скорость действительно достигается. ### По вычислениям {#po-vychisleniiam} From 4dcbd6775a9cd1afe3c8be96e3c68c397ae547f0 Mon Sep 17 00:00:00 2001 From: Thom O'Connor Date: Fri, 3 Nov 2023 09:46:35 -0600 Subject: [PATCH 0003/1081] Update kill.md Added additional examples and context for killing queries and mutations --- docs/en/sql-reference/statements/kill.md | 64 ++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/docs/en/sql-reference/statements/kill.md b/docs/en/sql-reference/statements/kill.md index 294724dfa50..32de7a41e72 100644 --- a/docs/en/sql-reference/statements/kill.md +++ b/docs/en/sql-reference/statements/kill.md @@ -21,6 +21,35 @@ The queries to terminate are selected from the system.processes table using the Examples: +First, you'll need to get the list of incomplete queries. 
This SQL queries provides them according to those running the longest: + +List from a single ClickHouse node: +``` sql +SELECT + initial_query_id, + query_id, + formatReadableTimeDelta(elapsed) AS time_delta, + query, + * + FROM system.processes + WHERE query ILIKE 'SELECT%' + ORDER BY time_delta DESC; +``` + +List from a ClickHouse cluster: +``` sql +SELECT + initial_query_id, + query_id, + formatReadableTimeDelta(elapsed) AS time_delta, + query, + * + FROM clusterAllReplicas(default, system.processes) + WHERE query ILIKE 'SELECT%' + ORDER BY time_delta DESC; +``` + +Kill the query: ``` sql -- Forcibly terminates all queries with the specified query_id: KILL QUERY WHERE query_id='2-857d-4a57-9ee0-327da5d60a90' @@ -44,6 +73,8 @@ A test query (`TEST`) only checks the user’s rights and displays a list of que ## KILL MUTATION +One of the first things to check if a ClickHouse system or service is not running well is for long-running, incomplete mutations. The asynchronous (background) nature of mutations can cause a large queue of them that can then consume all available resources on the service. You may need to either pause all new mutations, INSERTs, and SELECTs and allow the queue of mutations to complete, or else manually kill some of these mutations. + ``` sql KILL MUTATION [ON CLUSTER cluster] WHERE @@ -57,6 +88,39 @@ A test query (`TEST`) only checks the user’s rights and displays a list of mut Examples: +Get a count() of the number of incomplete mutations: + +Count of mutations from a single ClickHouse node: +``` sql +SELECT count(*) +FROM system.mutations +WHERE is_done = 0; +``` + +Count of mutations from a ClickHouse cluster of replicas: +``` sql +SELECT count(*) +FROM clusterAllReplicas('default',system.mutations) +WHERE is_done = 0; +``` + +Query the list of incomplete mutations: + +List of mutations from a single ClickHouse node: +``` sql +SELECT mutation_id,* +FROM system.mutations +WHERE is_done = 0; +``` + +List of mutations from a ClickHouse cluster: +``` sql +SELECT mutation_id,* +FROM clusterAllReplicas('default',system.mutations) +WHERE is_done = 0; +``` + +Kill the mutations as needed: ``` sql -- Cancel and remove all mutations of the single table: KILL MUTATION WHERE database = 'default' AND table = 'table' From 1134af19caeaffcf70cc94146faed346d6af0cf6 Mon Sep 17 00:00:00 2001 From: Justin de Guzman Date: Tue, 7 Nov 2023 22:33:29 -0800 Subject: [PATCH 0004/1081] [Docs] Fix typo Co-authored-by: Johnny <9611008+johnnymatthews@users.noreply.github.com> --- docs/en/sql-reference/statements/kill.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/kill.md b/docs/en/sql-reference/statements/kill.md index 32de7a41e72..a7d050e548c 100644 --- a/docs/en/sql-reference/statements/kill.md +++ b/docs/en/sql-reference/statements/kill.md @@ -21,7 +21,7 @@ The queries to terminate are selected from the system.processes table using the Examples: -First, you'll need to get the list of incomplete queries. This SQL queries provides them according to those running the longest: +First, you'll need to get the list of incomplete queries. 
This SQL query provides them according to those running the longest: List from a single ClickHouse node: ``` sql From 7e0d95e48cb399c047c9756d81b0f76ce67ea57f Mon Sep 17 00:00:00 2001 From: Justin de Guzman Date: Tue, 7 Nov 2023 22:33:43 -0800 Subject: [PATCH 0005/1081] [Docs] Formatting Co-authored-by: Johnny <9611008+johnnymatthews@users.noreply.github.com> --- docs/en/sql-reference/statements/kill.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/kill.md b/docs/en/sql-reference/statements/kill.md index a7d050e548c..57448c4f441 100644 --- a/docs/en/sql-reference/statements/kill.md +++ b/docs/en/sql-reference/statements/kill.md @@ -88,7 +88,7 @@ A test query (`TEST`) only checks the user’s rights and displays a list of mut Examples: -Get a count() of the number of incomplete mutations: +Get a `count()` of the number of incomplete mutations: Count of mutations from a single ClickHouse node: ``` sql From ec02a2a2c4f4d4a279732df2c2dd61ab8b0cb80a Mon Sep 17 00:00:00 2001 From: Justin de Guzman Date: Tue, 7 Nov 2023 22:36:49 -0800 Subject: [PATCH 0006/1081] [Docs] Reword for clarity Co-authored-by: Johnny <9611008+johnnymatthews@users.noreply.github.com> --- docs/en/sql-reference/statements/kill.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/kill.md b/docs/en/sql-reference/statements/kill.md index 57448c4f441..c85870fc0c8 100644 --- a/docs/en/sql-reference/statements/kill.md +++ b/docs/en/sql-reference/statements/kill.md @@ -73,7 +73,10 @@ A test query (`TEST`) only checks the user’s rights and displays a list of que ## KILL MUTATION -One of the first things to check if a ClickHouse system or service is not running well is for long-running, incomplete mutations. The asynchronous (background) nature of mutations can cause a large queue of them that can then consume all available resources on the service. You may need to either pause all new mutations, INSERTs, and SELECTs and allow the queue of mutations to complete, or else manually kill some of these mutations. +The presence of long-running or incomplete mutations often indicates that a ClickHouse service is running poorly. The asynchronous nature of mutations can cause them to consume all available resources on a system. You may need to either: + +- Pause all new mutations, `INSERT`s , and `SELECT`s and allow the queue of mutations to complete. +- Or manually kill some of these mutations by sending a `KILLSIG` command. 
``` sql KILL MUTATION [ON CLUSTER cluster] From 039bb1d599a5262e558b9b4ebd66fd85469afa3c Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 22 Jan 2024 20:26:28 +0100 Subject: [PATCH 0007/1081] fix race on Context::async_insert_queue --- src/Interpreters/Context.cpp | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 7e89c794712..51cfd302338 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -181,6 +181,7 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; extern const int CLUSTER_DOESNT_EXIST; + extern const int ABORTED; } #define SHUTDOWN(log, desc, ptr, method) do \ @@ -556,7 +557,12 @@ struct ContextSharedPart : boost::noncopyable return; /// Need to flush the async insert queue before shutting down the database catalog - async_insert_queue.reset(); + std::shared_ptr delete_async_insert_queue; + { + std::lock_guard lock(mutex); + delete_async_insert_queue = std::move(async_insert_queue); + } + delete_async_insert_queue.reset(); /// Stop periodic reloading of the configuration files. /// This must be done first because otherwise the reloading may pass a changed config @@ -4838,11 +4844,15 @@ PartUUIDsPtr Context::getIgnoredPartUUIDs() const AsynchronousInsertQueue * Context::getAsynchronousInsertQueue() const { - return shared->async_insert_queue.get(); + std::lock_guard lock(mutex); + if (auto res = shared->async_insert_queue.get()) + return res; + throw Exception(ErrorCodes::ABORTED, "AsynchronousInsertQueue is not initialized yet or has been already shutdown"); } void Context::setAsynchronousInsertQueue(const std::shared_ptr & ptr) { + std::lock_guard lock(mutex); using namespace std::chrono; if (std::chrono::milliseconds(settings.async_insert_busy_timeout_ms) == 0ms) From eb881667638524f182f06e19d699704ce9e86196 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 Jan 2024 00:28:28 +0100 Subject: [PATCH 0008/1081] Update Context.cpp --- src/Interpreters/Context.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 51cfd302338..217b247c21c 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -181,7 +181,6 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int NUMBER_OF_COLUMNS_DOESNT_MATCH; extern const int CLUSTER_DOESNT_EXIST; - extern const int ABORTED; } #define SHUTDOWN(log, desc, ptr, method) do \ @@ -4845,9 +4844,7 @@ PartUUIDsPtr Context::getIgnoredPartUUIDs() const AsynchronousInsertQueue * Context::getAsynchronousInsertQueue() const { std::lock_guard lock(mutex); - if (auto res = shared->async_insert_queue.get()) - return res; - throw Exception(ErrorCodes::ABORTED, "AsynchronousInsertQueue is not initialized yet or has been already shutdown"); + return shared->async_insert_queue.get(); } void Context::setAsynchronousInsertQueue(const std::shared_ptr & ptr) From 4cfc8d1a34342d44adbc7d9c8c3a4916670d68b2 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Tue, 23 Jan 2024 00:30:42 +0100 Subject: [PATCH 0009/1081] better method name --- src/Interpreters/Context.cpp | 2 +- src/Interpreters/Context.h | 2 +- src/Interpreters/InterpreterSystemQuery.cpp | 2 +- src/Interpreters/executeQuery.cpp | 2 +- src/Server/TCPHandler.cpp | 2 +- src/Storages/System/StorageSystemAsynchronousInserts.cpp | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff 
--git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 217b247c21c..ab42e6b0ec9 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -4841,7 +4841,7 @@ PartUUIDsPtr Context::getIgnoredPartUUIDs() const return ignored_part_uuids; } -AsynchronousInsertQueue * Context::getAsynchronousInsertQueue() const +AsynchronousInsertQueue * Context::tryGetAsynchronousInsertQueue() const { std::lock_guard lock(mutex); return shared->async_insert_queue.get(); diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 65566876a80..6180bfbde88 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -1203,7 +1203,7 @@ public: PartUUIDsPtr getPartUUIDs() const; PartUUIDsPtr getIgnoredPartUUIDs() const; - AsynchronousInsertQueue * getAsynchronousInsertQueue() const; + AsynchronousInsertQueue * tryGetAsynchronousInsertQueue() const; void setAsynchronousInsertQueue(const std::shared_ptr & ptr); ReadTaskCallback getReadTaskCallback() const; diff --git a/src/Interpreters/InterpreterSystemQuery.cpp b/src/Interpreters/InterpreterSystemQuery.cpp index 1712c9608bf..f478b43049f 100644 --- a/src/Interpreters/InterpreterSystemQuery.cpp +++ b/src/Interpreters/InterpreterSystemQuery.cpp @@ -692,7 +692,7 @@ BlockIO InterpreterSystemQuery::execute() case Type::FLUSH_ASYNC_INSERT_QUEUE: { getContext()->checkAccess(AccessType::SYSTEM_FLUSH_ASYNC_INSERT_QUEUE); - auto * queue = getContext()->getAsynchronousInsertQueue(); + auto * queue = getContext()->tryGetAsynchronousInsertQueue(); if (!queue) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot flush asynchronous insert queue because it is not initialized"); diff --git a/src/Interpreters/executeQuery.cpp b/src/Interpreters/executeQuery.cpp index 4b5a6a84e17..a84c957d9a8 100644 --- a/src/Interpreters/executeQuery.cpp +++ b/src/Interpreters/executeQuery.cpp @@ -924,7 +924,7 @@ static std::tuple executeQueryImpl( std::unique_ptr interpreter; bool async_insert = false; - auto * queue = context->getAsynchronousInsertQueue(); + auto * queue = context->tryGetAsynchronousInsertQueue(); auto * logger = &Poco::Logger::get("executeQuery"); if (insert_query && async_insert_enabled) diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index fa7206eeaac..9bc6c3872fd 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -907,7 +907,7 @@ void TCPHandler::processInsertQuery() Block processed_block; const auto & settings = query_context->getSettingsRef(); - auto * insert_queue = query_context->getAsynchronousInsertQueue(); + auto * insert_queue = query_context->tryGetAsynchronousInsertQueue(); const auto & insert_query = assert_cast(*state.parsed_query); bool async_insert_enabled = settings.async_insert; diff --git a/src/Storages/System/StorageSystemAsynchronousInserts.cpp b/src/Storages/System/StorageSystemAsynchronousInserts.cpp index 20ba4d1cdfb..b480821f8ea 100644 --- a/src/Storages/System/StorageSystemAsynchronousInserts.cpp +++ b/src/Storages/System/StorageSystemAsynchronousInserts.cpp @@ -34,7 +34,7 @@ void StorageSystemAsynchronousInserts::fillData(MutableColumns & res_columns, Co { using namespace std::chrono; - auto * insert_queue = context->getAsynchronousInsertQueue(); + auto * insert_queue = context->tryGetAsynchronousInsertQueue(); if (!insert_queue) return; From f91feb0dcb405df80f317f456372c7374f2c75ee Mon Sep 17 00:00:00 2001 From: Daniil Ivanik Date: Tue, 30 Jan 2024 14:17:11 +0100 Subject: [PATCH 0010/1081] Initial working commit --- 
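A note on this commit: it begins the `generate_series` table function that the following patches in the series refine. Judging by the argument handling in the new `TableFunctionGenerateSeries` added below (two or three arguments; an empty result when start exceeds stop; a limit of `(stop - start) / step + 1` rows), the intended interface is a bounded range with an optional step. A hedged usage sketch with illustrative values — these are not queries from the series' test files:

``` sql
-- generate_series(start, stop[, step]) yields start, start + step, ...
-- without exceeding stop; an empty set when start > stop.
SELECT * FROM generate_series(1, 10, 3);  -- 1, 4, 7, 10
SELECT * FROM generate_series(5, 4);      -- no rows
```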
.../QueryPlan/ReadFromSystemNumbersStep.cpp | 210 ++++++++++-------- src/Storages/SelectQueryInfo.h | 2 +- src/Storages/System/StorageSystemNumbers.cpp | 8 +- src/Storages/System/StorageSystemNumbers.h | 36 +-- src/Storages/System/attachSystemTables.cpp | 5 +- src/TableFunctions/CMakeLists.txt | 2 +- src/TableFunctions/ITableFunction.cpp | 2 +- src/TableFunctions/TableFunctionNumbers.cpp | 2 +- .../TableFunctionsGenerateSeries.cpp | 100 +++++++++ src/TableFunctions/registerTableFunctions.cpp | 1 + src/TableFunctions/registerTableFunctions.h | 1 + 11 files changed, 234 insertions(+), 135 deletions(-) create mode 100644 src/TableFunctions/TableFunctionsGenerateSeries.cpp diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index 5173b18c6bf..13a14ffb917 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -26,41 +26,59 @@ namespace class NumbersSource : public ISource { public: - NumbersSource(UInt64 block_size_, UInt64 offset_, UInt64 step_) - : ISource(createHeader()), block_size(block_size_), next(offset_), step(step_) + NumbersSource(UInt64 block_size_, UInt64 offset_, UInt64 step_, const std::string& column_name, UInt64 inner_step_) + : ISource(createHeader(column_name)), block_size(block_size_), next(offset_), step(step_), inner_step(inner_step_), inner_remainder(offset_ % inner_step_) { } String getName() const override { return "Numbers"; } - static Block createHeader() { return {ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), "number")}; } + static Block createHeader(const std::string& column_name) { return {ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), column_name)}; } protected: Chunk generate() override { - auto column = ColumnUInt64::create(block_size); - ColumnUInt64::Container & vec = column->getData(); UInt64 curr = next; /// The local variable for some reason works faster (>20%) than member of class. + UInt64 first_element = (curr / inner_step) * inner_step + inner_remainder; + if (first_element < curr) { + first_element += inner_step; + } + UInt64 filtered_block_size = 0; + if (first_element - curr >= block_size) { + auto column = ColumnUInt64::create(0); + return {Columns{std::move(column)}, filtered_block_size}; + } + if (first_element - curr < block_size) { + filtered_block_size = (block_size - (first_element - curr) - 1) / inner_step + 1; + } + + auto column = ColumnUInt64::create(filtered_block_size); + ColumnUInt64::Container & vec = column->getData(); UInt64 * pos = vec.data(); /// This also accelerates the code. 
- UInt64 * end = &vec[block_size]; - iota(pos, static_cast(end - pos), curr); + UInt64 * end = &vec[filtered_block_size]; + iota(pos, static_cast(end - pos), UInt64{0}); + for (UInt64 p = 0; p < filtered_block_size; p += 1) { + vec[p] = vec[p] * inner_step + first_element; + } next += step; progress(column->size(), column->byteSize()); - return {Columns{std::move(column)}, block_size}; + return {Columns{std::move(column)}, filtered_block_size}; } private: UInt64 block_size; UInt64 next; UInt64 step; + UInt64 inner_step; + UInt64 inner_remainder; }; -UInt128 sizeOfRange(const Range & r) +[[maybe_unused]] UInt128 sizeOfRange(const Range & r) { UInt128 size; if (r.right.isPositiveInfinity()) @@ -77,7 +95,7 @@ UInt128 sizeOfRange(const Range & r) return size; }; -auto sizeOfRanges(const Ranges & rs) +[[maybe_unused]] auto sizeOfRanges(const Ranges & rs) { UInt128 total_size{}; for (const Range & r : rs) @@ -91,7 +109,7 @@ auto sizeOfRanges(const Ranges & rs) /// Generate numbers according to ranges. /// Numbers generated is ordered in one stream. /// Notice that we will not generate additional numbers out of ranges. -class NumbersRangedSource : public ISource +class [[maybe_unused]] NumbersRangedSource : public ISource { public: /// Represent a position in Ranges list. @@ -109,8 +127,8 @@ public: using RangesStatePtr = std::shared_ptr; - NumbersRangedSource(const Ranges & ranges_, RangesStatePtr & ranges_state_, UInt64 base_block_size_) - : ISource(NumbersSource::createHeader()), ranges(ranges_), ranges_state(ranges_state_), base_block_size(base_block_size_) + [[maybe_unused]] NumbersRangedSource(const Ranges & ranges_, RangesStatePtr & ranges_state_, UInt64 base_block_size_, const std::string& column_name) + : ISource(NumbersSource::createHeader(column_name)), ranges(ranges_), ranges_state(ranges_state_), base_block_size(base_block_size_) { } @@ -273,7 +291,7 @@ private: namespace { /// Whether we should push limit down to scan. -bool shouldPushdownLimit(SelectQueryInfo & query_info, UInt64 limit_length) +[[maybe_unused]] bool shouldPushdownLimit(SelectQueryInfo & query_info, UInt64 limit_length) { const auto & query = query_info.query->as(); /// Just ignore some minor cases, such as: @@ -286,7 +304,7 @@ bool shouldPushdownLimit(SelectQueryInfo & query_info, UInt64 limit_length) /// Shrink ranges to size. 
/// For example: ranges: [1, 5], [8, 100]; size: 7, we will get [1, 5], [8, 9] -void shrinkRanges(Ranges & ranges, size_t size) +[[maybe_unused]] void shrinkRanges(Ranges & ranges, size_t size) { size_t last_range_idx = 0; for (size_t i = 0; i < ranges.size(); i++) @@ -375,107 +393,107 @@ Pipe ReadFromSystemNumbersStep::makePipe() num_streams = 1; /// Build rpn of query filters - KeyCondition condition(buildFilterDAG(), context, column_names, key_expression); + // KeyCondition condition(buildFilterDAG(), context, column_names, key_expression); Pipe pipe; Ranges ranges; - if (condition.extractPlainRanges(ranges)) - { - /// Intersect ranges with table range - std::optional table_range; - std::optional overflowed_table_range; + // if (condition.extractPlainRanges(ranges)) + // { + // /// Intersect ranges with table range + // std::optional table_range; + // std::optional overflowed_table_range; - if (numbers_storage.limit.has_value()) - { - if (std::numeric_limits::max() - numbers_storage.offset >= *(numbers_storage.limit)) - { - table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(numbers_storage.offset + *(numbers_storage.limit)), false); - } - /// UInt64 overflow, for example: SELECT number FROM numbers(18446744073709551614, 5) - else - { - table_range.emplace(FieldRef(numbers_storage.offset), true, std::numeric_limits::max(), true); - auto overflow_end = UInt128(numbers_storage.offset) + UInt128(*numbers_storage.limit); - overflowed_table_range.emplace( - FieldRef(UInt64(0)), true, FieldRef(UInt64(overflow_end - std::numeric_limits::max() - 1)), false); - } - } - else - { - table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(std::numeric_limits::max()), true); - } + // if (numbers_storage.limit.has_value()) + // { + // if (std::numeric_limits::max() - numbers_storage.offset >= *(numbers_storage.limit)) + // { + // table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(numbers_storage.offset + *(numbers_storage.limit)), false); + // } + // /// UInt64 overflow, for example: SELECT number FROM numbers(18446744073709551614, 5) + // else + // { + // table_range.emplace(FieldRef(numbers_storage.offset), true, std::numeric_limits::max(), true); + // auto overflow_end = UInt128(numbers_storage.offset) + UInt128(*numbers_storage.limit); + // overflowed_table_range.emplace( + // FieldRef(UInt64(0)), true, FieldRef(UInt64(overflow_end - std::numeric_limits::max() - 1)), false); + // } + // } + // else + // { + // table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(std::numeric_limits::max()), true); + // } - Ranges intersected_ranges; - for (auto & r : ranges) - { - auto intersected_range = table_range->intersectWith(r); - if (intersected_range) - intersected_ranges.push_back(*intersected_range); - } - /// intersection with overflowed_table_range goes back. - if (overflowed_table_range.has_value()) - { - for (auto & r : ranges) - { - auto intersected_range = overflowed_table_range->intersectWith(r); - if (intersected_range) - intersected_ranges.push_back(*overflowed_table_range); - } - } + // Ranges intersected_ranges; + // for (auto & r : ranges) + // { + // auto intersected_range = table_range->intersectWith(r); + // if (intersected_range) + // intersected_ranges.push_back(*intersected_range); + // } + // /// intersection with overflowed_table_range goes back. 
+ // if (overflowed_table_range.has_value()) + // { + // for (auto & r : ranges) + // { + // auto intersected_range = overflowed_table_range->intersectWith(r); + // if (intersected_range) + // intersected_ranges.push_back(*overflowed_table_range); + // } + // } - /// ranges is blank, return a source who has no data - if (intersected_ranges.empty()) - { - pipe.addSource(std::make_shared(NumbersSource::createHeader())); - return pipe; - } - const auto & limit_length = limit_length_and_offset.first; - const auto & limit_offset = limit_length_and_offset.second; + // /// ranges is blank, return a source who has no data + // if (intersected_ranges.empty()) + // { + // pipe.addSource(std::make_shared(NumbersSource::createHeader(numbers_storage.column_name))); + // return pipe; + // } + // const auto & limit_length = limit_length_and_offset.first; + // const auto & limit_offset = limit_length_and_offset.second; - /// If intersected ranges is limited or we can pushdown limit. - if (!intersected_ranges.rbegin()->right.isPositiveInfinity() || should_pushdown_limit) - { - UInt128 total_size = sizeOfRanges(intersected_ranges); - UInt128 query_limit = limit_length + limit_offset; + // /// If intersected ranges is limited or we can pushdown limit. + // if (!intersected_ranges.rbegin()->right.isPositiveInfinity() || should_pushdown_limit) + // { + // UInt128 total_size = sizeOfRanges(intersected_ranges); + // UInt128 query_limit = limit_length + limit_offset; - /// limit total_size by query_limit - if (should_pushdown_limit && query_limit < total_size) - { - total_size = query_limit; - /// We should shrink intersected_ranges for case: - /// intersected_ranges: [1, 4], [7, 100]; query_limit: 2 - shrinkRanges(intersected_ranges, total_size); - } + // /// limit total_size by query_limit + // if (should_pushdown_limit && query_limit < total_size) + // { + // total_size = query_limit; + // /// We should shrink intersected_ranges for case: + // /// intersected_ranges: [1, 4], [7, 100]; query_limit: 2 + // shrinkRanges(intersected_ranges, total_size); + // } - checkLimits(size_t(total_size)); + // checkLimits(size_t(total_size)); - if (total_size / max_block_size < num_streams) - num_streams = static_cast(total_size / max_block_size); + // if (total_size / max_block_size < num_streams) + // num_streams = static_cast(total_size / max_block_size); - if (num_streams == 0) - num_streams = 1; + // if (num_streams == 0) + // num_streams = 1; - /// Ranges state, all streams will share the state. - auto ranges_state = std::make_shared(); - for (size_t i = 0; i < num_streams; ++i) - { - auto source = std::make_shared(intersected_ranges, ranges_state, max_block_size); + // /// Ranges state, all streams will share the state. 
+ // auto ranges_state = std::make_shared(); + // for (size_t i = 0; i < num_streams; ++i) + // { + // auto source = std::make_shared(intersected_ranges, ranges_state, max_block_size, numbers_storage.column_name); - if (i == 0) - source->addTotalRowsApprox(total_size); + // if (i == 0) + // source->addTotalRowsApprox(total_size); - pipe.addSource(std::move(source)); - } - return pipe; - } - } + // pipe.addSource(std::move(source)); + // } + // return pipe; + // } + // } /// Fall back to NumbersSource for (size_t i = 0; i < num_streams; ++i) { auto source - = std::make_shared(max_block_size, numbers_storage.offset + i * max_block_size, num_streams * max_block_size); + = std::make_shared(max_block_size, numbers_storage.offset + i * max_block_size, num_streams * max_block_size, numbers_storage.column_name, numbers_storage.step); if (numbers_storage.limit && i == 0) { diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 662a5c0ef5a..2b4afaa6345 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -229,4 +229,4 @@ struct SelectQueryInfo bool isFinal() const; }; -} + } diff --git a/src/Storages/System/StorageSystemNumbers.cpp b/src/Storages/System/StorageSystemNumbers.cpp index b100be7cdf4..cd7207917a9 100644 --- a/src/Storages/System/StorageSystemNumbers.cpp +++ b/src/Storages/System/StorageSystemNumbers.cpp @@ -11,15 +11,16 @@ #include #include #include +#include namespace DB { -StorageSystemNumbers::StorageSystemNumbers(const StorageID & table_id, bool multithreaded_, std::optional limit_, UInt64 offset_) - : IStorage(table_id), multithreaded(multithreaded_), limit(limit_), offset(offset_) +StorageSystemNumbers::StorageSystemNumbers(const StorageID & table_id, bool multithreaded_, const std::string& column_name_, std::optional limit_, UInt64 offset_, UInt64 step_) + : IStorage(table_id), multithreaded(multithreaded_), limit(limit_), offset(offset_), column_name(column_name_), step(step_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(ColumnsDescription({{"number", std::make_shared()}})); + storage_metadata.setColumns(ColumnsDescription({{column_name_, std::make_shared()}})); setInMemoryMetadata(storage_metadata); } @@ -33,6 +34,7 @@ void StorageSystemNumbers::read( size_t max_block_size, size_t num_streams) { + // LOG_DEBUG(&Poco::Logger::get("Reading from SystemNumbers"), "Limit : {}", limit.value()); query_plan.addStep(std::make_unique( column_names, shared_from_this(), storage_snapshot, query_info, std::move(context), max_block_size, num_streams)); } diff --git a/src/Storages/System/StorageSystemNumbers.h b/src/Storages/System/StorageSystemNumbers.h index fe6227db406..ffe87b8ad14 100644 --- a/src/Storages/System/StorageSystemNumbers.h +++ b/src/Storages/System/StorageSystemNumbers.h @@ -10,39 +10,11 @@ namespace DB class Context; - -/** Implements a table engine for the system table "numbers". - * The table contains the only column number UInt64. - * From this table, you can read all natural numbers, starting from 0 (to 2^64 - 1, and then again). - * - * You could also specify a limit (how many numbers to give). - * - * How to generate numbers? - * - * 1. First try a smart fashion: - * - * In this fashion we try to push filters and limit down to scanning. - * Firstly extract plain ranges(no overlapping and ordered) by filter expressions. 
- * - * For example: - * where (numbers > 1 and numbers < 3) or (numbers in (4, 6)) or (numbers > 7 and numbers < 9) - * - * We will get ranges - * (1, 3), [4, 4], [6, 6], (7, 9) - * - * Then split the ranges evenly to one or multi-streams. With this way we will get result without large scanning. - * - * 2. If fail to extract plain ranges, fall back to ordinary scanning. - * - * If multithreaded is specified, numbers will be generated in several streams - * (and result could be out of order). If both multithreaded and limit are specified, - * the table could give you not exactly 1..limit range, but some arbitrary 'limit' numbers. - */ -class StorageSystemNumbers final : public IStorage +class StorageSystemNumbers final : public IStorage { public: /// Otherwise, streams concurrently increment atomic. - StorageSystemNumbers(const StorageID & table_id, bool multithreaded_, std::optional limit_ = std::nullopt, UInt64 offset_ = 0); + StorageSystemNumbers(const StorageID & table_id, bool multithreaded_, const std::string& column_name, std::optional limit_ = std::nullopt, UInt64 offset_ = 0, UInt64 step_ = 1); std::string getName() const override { return "SystemNumbers"; } @@ -67,6 +39,10 @@ private: bool multithreaded; std::optional limit; UInt64 offset; + std::string column_name; + + UInt64 step; + }; } diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index bf898f57833..ddd89709b6a 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -118,8 +118,9 @@ namespace DB void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, bool has_zookeeper) { attach(context, system_database, "one", "This table contains a single row with a single dummy UInt8 column containing the value 0. Used when the table is not specified explicitly, for example in queries like `SELECT 1`."); - attach(context, system_database, "numbers", "Generates all natural numbers, starting from 0 (to 2^64 - 1, and then again) in sorted order.", false); - attach(context, system_database, "numbers_mt", "Multithreaded version of `system.numbers`. Numbers order is not guaranteed.", true); + attach(context, system_database, "numbers", "Generates all natural numbers, starting from 0 (to 2^64 - 1, and then again) in sorted order.", false, "number"); + attach(context, system_database, "numbers_mt", "Multithreaded version of `system.numbers`. Numbers order is not guaranteed.", true, "number"); + // attach(context, system_database, "generate_series", "Multithreaded version of `system.numbers`. 
Numbers order is not guaranteed.", false, "generate_series"); attach(context, system_database, "zeros", "Produces unlimited number of non-materialized zeros.", false); attach(context, system_database, "zeros_mt", "Multithreaded version of system.zeros.", true); attach(context, system_database, "databases", "Lists all databases of the current server."); diff --git a/src/TableFunctions/CMakeLists.txt b/src/TableFunctions/CMakeLists.txt index 770990cc405..c5c2a660935 100644 --- a/src/TableFunctions/CMakeLists.txt +++ b/src/TableFunctions/CMakeLists.txt @@ -29,7 +29,7 @@ if (TARGET ch_contrib::azure_sdk) target_link_libraries(clickhouse_table_functions PRIVATE ch_contrib::azure_sdk) endif () -if (TARGET ch_contrib::simdjson) +if (TARGET ch_co`trib::simdjson) target_link_libraries(clickhouse_table_functions PRIVATE ch_contrib::simdjson) endif () diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index 137e1dc27fe..c854b6b0f9c 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -5,7 +5,7 @@ #include #include #include - +#include namespace ProfileEvents { diff --git a/src/TableFunctions/TableFunctionNumbers.cpp b/src/TableFunctions/TableFunctionNumbers.cpp index 262018f108c..71a9ba097c6 100644 --- a/src/TableFunctions/TableFunctionNumbers.cpp +++ b/src/TableFunctions/TableFunctionNumbers.cpp @@ -63,7 +63,7 @@ StoragePtr TableFunctionNumbers::executeImpl(const ASTPtr & ast_f UInt64 offset = arguments.size() == 2 ? evaluateArgument(context, arguments[0]) : 0; UInt64 length = arguments.size() == 2 ? evaluateArgument(context, arguments[1]) : evaluateArgument(context, arguments[0]); - auto res = std::make_shared(StorageID(getDatabaseName(), table_name), multithreaded, length, offset); + auto res = std::make_shared(StorageID(getDatabaseName(), table_name), multithreaded, std::string{"number"}, length, offset); res->startup(); return res; } diff --git a/src/TableFunctions/TableFunctionsGenerateSeries.cpp b/src/TableFunctions/TableFunctionsGenerateSeries.cpp new file mode 100644 index 00000000000..3941f1eadb2 --- /dev/null +++ b/src/TableFunctions/TableFunctionsGenerateSeries.cpp @@ -0,0 +1,100 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "registerTableFunctions.h" + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + +namespace +{ + +/* numbers(limit), numbers_mt(limit) + * - the same as SELECT number FROM system.numbers LIMIT limit. + * Used for testing purposes, as a simple example of table function. 
+ */ +class TableFunctionGenerateSeries : public ITableFunction +{ +public: + static constexpr auto name = "generate_series"; + std::string getName() const override { return name; } + bool hasStaticStructure() const override { return true; } +private: + StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; + const char * getStorageTypeName() const override { return "SystemNumbers"; } + + UInt64 evaluateArgument(ContextPtr context, ASTPtr & argument) const; + + ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; +}; + +ColumnsDescription TableFunctionGenerateSeries::getActualTableStructure(ContextPtr /*context*/, bool /*is_insert_query*/) const +{ + /// NOTE: https://bugs.llvm.org/show_bug.cgi?id=47418 + return ColumnsDescription{{{"generate_series", std::make_shared()}}}; +} + +StoragePtr TableFunctionGenerateSeries::executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const +{ + if (const auto * function = ast_function->as()) + { + auto arguments = function->arguments->children; + + if (arguments.size() != 2 && arguments.size() != 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires 'length' or 'offset, length'.", getName()); + + UInt64 start = evaluateArgument(context, arguments[0]); + UInt64 stop = evaluateArgument(context, arguments[1]); + UInt64 interval = (arguments.size() == 3) ? evaluateArgument(context, arguments[2]) : UInt64{1}; + if (start > stop) { + auto res = std::make_shared(StorageID(getDatabaseName(), table_name), false, std::string{"generate_series"}, 0); + res->startup(); + return res; + } + + auto res = std::make_shared(StorageID(getDatabaseName(), table_name), false, std::string{"generate_series"}, (stop - start) / interval + 1, start, interval); + res->startup(); + return res; + } + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires 'limit' or 'offset, limit'.", getName()); +} + +UInt64 TableFunctionGenerateSeries::evaluateArgument(ContextPtr context, ASTPtr & argument) const +{ + const auto & [field, type] = evaluateConstantExpression(argument, context); + + if (!isNativeNumber(type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} expression, must be numeric type", type->getName()); + + Field converted = convertFieldToType(field, DataTypeUInt64()); + if (converted.isNull()) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The value {} is not representable as UInt64", + applyVisitor(FieldVisitorToString(), field)); + + return converted.safeGet(); +} + + +} + +void registerTableFunctionGenerateSeries(TableFunctionFactory & factory) +{ + factory.registerFunction({.documentation = {}, .allow_readonly = true}); + // factory.registerFunction({.documentation = {}, .allow_readonly = true}); +} + +} diff --git a/src/TableFunctions/registerTableFunctions.cpp b/src/TableFunctions/registerTableFunctions.cpp index 8c18c298f45..1631fa8e879 100644 --- a/src/TableFunctions/registerTableFunctions.cpp +++ b/src/TableFunctions/registerTableFunctions.cpp @@ -11,6 +11,7 @@ void registerTableFunctions() registerTableFunctionMerge(factory); registerTableFunctionRemote(factory); registerTableFunctionNumbers(factory); + registerTableFunctionGenerateSeries(factory); registerTableFunctionNull(factory); 
registerTableFunctionZeros(factory); registerTableFunctionExecutable(factory); diff --git a/src/TableFunctions/registerTableFunctions.h b/src/TableFunctions/registerTableFunctions.h index fae763e7dc8..111fbe8c22f 100644 --- a/src/TableFunctions/registerTableFunctions.h +++ b/src/TableFunctions/registerTableFunctions.h @@ -8,6 +8,7 @@ class TableFunctionFactory; void registerTableFunctionMerge(TableFunctionFactory & factory); void registerTableFunctionRemote(TableFunctionFactory & factory); void registerTableFunctionNumbers(TableFunctionFactory & factory); +void registerTableFunctionGenerateSeries(TableFunctionFactory & factory); void registerTableFunctionNull(TableFunctionFactory & factory); void registerTableFunctionZeros(TableFunctionFactory & factory); void registerTableFunctionExecutable(TableFunctionFactory & factory); From 3f0cfbd8c0816b007ff85b1a3997696ce5ed3214 Mon Sep 17 00:00:00 2001 From: Daniil Ivanik Date: Sat, 3 Feb 2024 19:46:00 +0100 Subject: [PATCH 0011/1081] Kek --- src/Common/iota.cpp | 29 ++ src/Common/iota.h | 9 + .../QueryPlan/ReadFromSystemNumbersStep.cpp | 281 ++++++++++-------- 3 files changed, 197 insertions(+), 122 deletions(-) diff --git a/src/Common/iota.cpp b/src/Common/iota.cpp index 98f18eb195b..532c4bde76d 100644 --- a/src/Common/iota.cpp +++ b/src/Common/iota.cpp @@ -27,10 +27,39 @@ void iota(T * begin, size_t count, T first_value) return iotaImpl(begin, count, first_value); } +MULTITARGET_FUNCTION_AVX2_SSE42( + MULTITARGET_FUNCTION_HEADER(template void NO_INLINE), + iotaWithStepImpl, MULTITARGET_FUNCTION_BODY((T * begin, size_t count, T first_value, T step) /// NOLINT + { + for (size_t i = 0; i < count; i++) + *(begin + i) = static_cast(first_value + i * step); + }) +) + +template +void iota_with_step(T * begin, size_t count, T first_value, T step) +{ +#if USE_MULTITARGET_CODE + if (isArchSupported(TargetArch::AVX2)) + return iotaWithStepImplAVX2(begin, count, first_value, step); + + if (isArchSupported(TargetArch::SSE42)) + return iotaWithStepImplSSE42(begin, count, first_value, step); +#endif + return iotaWithStepImpl(begin, count, first_value, step); +} + template void iota(UInt8 * begin, size_t count, UInt8 first_value); template void iota(UInt32 * begin, size_t count, UInt32 first_value); template void iota(UInt64 * begin, size_t count, UInt64 first_value); #if defined(OS_DARWIN) template void iota(size_t * begin, size_t count, size_t first_value); #endif + +template void iota_with_step(UInt8 * begin, size_t count, UInt8 first_value, UInt8 step); +template void iota_with_step(UInt32 * begin, size_t count, UInt32 first_value, UInt32 step); +template void iota_with_step(UInt64 * begin, size_t count, UInt64 first_value, UInt64 step); +#if defined(OS_DARWIN) +extern template void iota_with_step(size_t * begin, size_t count, size_t first_value, size_t step); +#endif } diff --git a/src/Common/iota.h b/src/Common/iota.h index 7910274d15d..f40cde9d5db 100644 --- a/src/Common/iota.h +++ b/src/Common/iota.h @@ -31,4 +31,13 @@ extern template void iota(UInt64 * begin, size_t count, UInt64 first_value); #if defined(OS_DARWIN) extern template void iota(size_t * begin, size_t count, size_t first_value); #endif + +template void iota_with_step(T * begin, size_t count, T first_value, T step); + +extern template void iota_with_step(UInt8 * begin, size_t count, UInt8 first_value, UInt8 step); +extern template void iota_with_step(UInt32 * begin, size_t count, UInt32 first_value, UInt32 step); +extern template void iota_with_step(UInt64 * begin, size_t count, UInt64 
first_value, UInt64 step); +#if defined(OS_DARWIN) +extern template void iota(size_t * begin, size_t count, size_t first_value, size_t step); +#endif } diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index 13a14ffb917..f85473e43c3 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -12,6 +12,8 @@ #include #include +#include + namespace DB { @@ -30,9 +32,9 @@ public: : ISource(createHeader(column_name)), block_size(block_size_), next(offset_), step(step_), inner_step(inner_step_), inner_remainder(offset_ % inner_step_) { } - String getName() const override { return "Numbers"; } + static Block createHeader(const std::string& column_name) { return {ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared(), column_name)}; } protected: @@ -57,10 +59,7 @@ protected: ColumnUInt64::Container & vec = column->getData(); UInt64 * pos = vec.data(); /// This also accelerates the code. UInt64 * end = &vec[filtered_block_size]; - iota(pos, static_cast(end - pos), UInt64{0}); - for (UInt64 p = 0; p < filtered_block_size; p += 1) { - vec[p] = vec[p] * inner_step + first_element; - } + iota_with_step(pos, static_cast(end - pos), first_element, inner_step); next += step; @@ -77,28 +76,46 @@ private: UInt64 inner_remainder; }; - -[[maybe_unused]] UInt128 sizeOfRange(const Range & r) -{ - UInt128 size; - if (r.right.isPositiveInfinity()) - return static_cast(std::numeric_limits::max()) - r.left.get() + r.left_included; - - size = static_cast(r.right.get()) - r.left.get() + 1; - - if (!r.left_included) - size--; - - if (!r.right_included) - size--; - assert(size >= 0); - return size; +struct RangeWithStep { + Range range; + UInt64 step; }; -[[maybe_unused]] auto sizeOfRanges(const Ranges & rs) +using RangesWithStep = std::vector; + +std::optional stepped_range_from_range(const Range& r, UInt64 step, UInt64 remainder) { + UInt64 begin = (r.left.get() / step) * step; + if (begin > std::numeric_limits::max() - remainder) { + return std::nullopt; + } + begin += remainder; + while (begin <= r.left.get() - r.left_included) { + if (std::numeric_limits::max() - step < begin) { + return std::nullopt; + } + begin += step; + } + + LOG_DEBUG(&Poco::Logger::get("stepped_range_from_range"), "Begin: {}", begin); + UInt128 right_edge = (r.right.get() + r.right_included); + if (begin >= right_edge) { + return std::nullopt; + } + return std::optional{RangeWithStep{Range(begin, true, static_cast(right_edge - 1), true), step}}; +} + +[[maybe_unused]] UInt128 sizeOfRange(const RangeWithStep & r) +{ + if (r.range.right.isPositiveInfinity()) + return static_cast(std::numeric_limits::max() - r.range.left.get()) / r.step + r.range.left_included; + + return static_cast(r.range.right.get() - r.range.left.get()) / r.step + 1; +}; + +[[maybe_unused]] auto sizeOfRanges(const RangesWithStep & rs) { UInt128 total_size{}; - for (const Range & r : rs) + for (const RangeWithStep & r : rs) { /// total_size will never overflow total_size += sizeOfRange(r); @@ -127,7 +144,7 @@ public: using RangesStatePtr = std::shared_ptr; - [[maybe_unused]] NumbersRangedSource(const Ranges & ranges_, RangesStatePtr & ranges_state_, UInt64 base_block_size_, const std::string& column_name) + [[maybe_unused]] NumbersRangedSource(const RangesWithStep & ranges_, RangesStatePtr & ranges_state_, UInt64 base_block_size_, const std::string& column_name) : ISource(NumbersSource::createHeader(column_name)), 
ranges(ranges_), ranges_state(ranges_state_), base_block_size(base_block_size_) { } @@ -187,9 +204,9 @@ protected: if (ranges.empty()) return {}; - auto first_value = [](const Range & r) { return r.left.get() + (r.left_included ? 0 : 1); }; + auto first_value = [](const RangeWithStep & r) { return r.range.left.get() + (r.range.left_included ? 0 : 1); }; - auto last_value = [](const Range & r) { return r.right.get() - (r.right_included ? 0 : 1); }; + auto last_value = [](const RangeWithStep & r) { return r.range.right.get() - (r.range.right_included ? 0 : 1); }; /// Find the data range. /// If data left is small, shrink block size. @@ -215,31 +232,33 @@ protected: UInt128 can_provide = cursor.offset_in_ranges == end.offset_in_ranges ? end.offset_in_range - cursor.offset_in_range - : static_cast(last_value(range)) - first_value(range) + 1 - cursor.offset_in_range; + : static_cast(last_value(range) - first_value(range)) / range.step + 1 - cursor.offset_in_range; /// set value to block - auto set_value = [&pos](UInt128 & start_value, UInt128 & end_value) + auto set_value = [&pos, this](UInt128 & start_value, UInt128 & end_value) { if (end_value > std::numeric_limits::max()) { - while (start_value < end_value) - *(pos++) = start_value++; + while (start_value < end_value) { + *(pos++) = start_value; + start_value += this->step; + } } else { auto start_value_64 = static_cast(start_value); auto end_value_64 = static_cast(end_value); auto size = end_value_64 - start_value_64; - iota(pos, static_cast(size), start_value_64); + iota_with_step(pos, static_cast(size), start_value_64, step); pos += size; } }; if (can_provide > need) { - UInt64 start_value = first_value(range) + cursor.offset_in_range; + UInt64 start_value = first_value(range) + cursor.offset_in_range * step; /// end_value will never overflow - iota(pos, static_cast(need), start_value); + iota_with_step(pos, static_cast(need), start_value, step); pos += need; provided += need; @@ -248,8 +267,8 @@ protected: else if (can_provide == need) { /// to avoid UInt64 overflow - UInt128 start_value = static_cast(first_value(range)) + cursor.offset_in_range; - UInt128 end_value = start_value + need; + UInt128 start_value = static_cast(first_value(range)) + cursor.offset_in_range * step; + UInt128 end_value = start_value + need * step; set_value(start_value, end_value); provided += need; @@ -259,8 +278,8 @@ protected: else { /// to avoid UInt64 overflow - UInt128 start_value = static_cast(first_value(range)) + cursor.offset_in_range; - UInt128 end_value = start_value + can_provide; + UInt128 start_value = static_cast(first_value(range)) + cursor.offset_in_range * step; + UInt128 end_value = start_value + can_provide * step; set_value(start_value, end_value); provided += static_cast(can_provide); @@ -277,13 +296,15 @@ protected: private: /// The ranges is shared between all streams. - Ranges ranges; + RangesWithStep ranges; /// Ranges state shared between all streams, actually is the start of the ranges. RangesStatePtr ranges_state; /// Base block size, will shrink when data left is not enough. UInt64 base_block_size; + + UInt64 step; }; } @@ -304,7 +325,7 @@ namespace /// Shrink ranges to size. 
/// For example: ranges: [1, 5], [8, 100]; size: 7, we will get [1, 5], [8, 9] -[[maybe_unused]] void shrinkRanges(Ranges & ranges, size_t size) +[[maybe_unused]] void shrinkRanges(RangesWithStep & ranges, size_t size) { size_t last_range_idx = 0; for (size_t i = 0; i < ranges.size(); i++) @@ -323,9 +344,9 @@ namespace else { auto & range = ranges[i]; - UInt64 right = range.left.get() + static_cast(size); - range.right = Field(right); - range.right_included = !range.left_included; + UInt64 right = range.range.left.get() + static_cast(size); + range.range.right = Field(right); + range.range.right_included = !range.range.left_included; last_range_idx = i; break; } @@ -393,101 +414,117 @@ Pipe ReadFromSystemNumbersStep::makePipe() num_streams = 1; /// Build rpn of query filters - // KeyCondition condition(buildFilterDAG(), context, column_names, key_expression); + KeyCondition condition(buildFilterDAG(), context, column_names, key_expression); Pipe pipe; Ranges ranges; - // if (condition.extractPlainRanges(ranges)) - // { - // /// Intersect ranges with table range - // std::optional table_range; - // std::optional overflowed_table_range; + if (condition.extractPlainRanges(ranges)) + { + LOG_DEBUG(&Poco::Logger::get("My logger"), "Use optimization"); + /// Intersect ranges with table range + std::optional table_range; + std::optional overflowed_table_range; - // if (numbers_storage.limit.has_value()) - // { - // if (std::numeric_limits::max() - numbers_storage.offset >= *(numbers_storage.limit)) - // { - // table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(numbers_storage.offset + *(numbers_storage.limit)), false); - // } - // /// UInt64 overflow, for example: SELECT number FROM numbers(18446744073709551614, 5) - // else - // { - // table_range.emplace(FieldRef(numbers_storage.offset), true, std::numeric_limits::max(), true); - // auto overflow_end = UInt128(numbers_storage.offset) + UInt128(*numbers_storage.limit); - // overflowed_table_range.emplace( - // FieldRef(UInt64(0)), true, FieldRef(UInt64(overflow_end - std::numeric_limits::max() - 1)), false); - // } - // } - // else - // { - // table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(std::numeric_limits::max()), true); - // } + if (numbers_storage.limit.has_value()) + { + if (std::numeric_limits::max() - numbers_storage.offset >= *(numbers_storage.limit)) + { + table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(numbers_storage.offset + *(numbers_storage.limit)), false); + } + /// UInt64 overflow, for example: SELECT number FROM numbers(18446744073709551614, 5) + else + { + table_range.emplace(FieldRef(numbers_storage.offset), true, std::numeric_limits::max(), true); + auto overflow_end = UInt128(numbers_storage.offset) + UInt128(*numbers_storage.limit); + overflowed_table_range.emplace( + FieldRef(UInt64(0)), true, FieldRef(UInt64(overflow_end - std::numeric_limits::max() - 1)), false); + } + } + else + { + table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(std::numeric_limits::max()), true); + } + LOG_DEBUG(&Poco::Logger::get("My logger"), "Found table ranges"); - // Ranges intersected_ranges; - // for (auto & r : ranges) - // { - // auto intersected_range = table_range->intersectWith(r); - // if (intersected_range) - // intersected_ranges.push_back(*intersected_range); - // } - // /// intersection with overflowed_table_range goes back. 
- // if (overflowed_table_range.has_value()) - // { - // for (auto & r : ranges) - // { - // auto intersected_range = overflowed_table_range->intersectWith(r); - // if (intersected_range) - // intersected_ranges.push_back(*overflowed_table_range); - // } - // } + RangesWithStep intersected_ranges; + for (auto & r : ranges) + { + auto intersected_range = table_range->intersectWith(r); + if (intersected_range.has_value()) { + auto range_with_step = stepped_range_from_range(intersected_range.value(), numbers_storage.step, numbers_storage.offset % numbers_storage.step); + if (range_with_step.has_value()) { + intersected_ranges.push_back(*range_with_step); + } + } + } - // /// ranges is blank, return a source who has no data - // if (intersected_ranges.empty()) - // { - // pipe.addSource(std::make_shared(NumbersSource::createHeader(numbers_storage.column_name))); - // return pipe; - // } - // const auto & limit_length = limit_length_and_offset.first; - // const auto & limit_offset = limit_length_and_offset.second; - // /// If intersected ranges is limited or we can pushdown limit. - // if (!intersected_ranges.rbegin()->right.isPositiveInfinity() || should_pushdown_limit) - // { - // UInt128 total_size = sizeOfRanges(intersected_ranges); - // UInt128 query_limit = limit_length + limit_offset; + for (const auto& range : intersected_ranges) { + LOG_DEBUG(&Poco::Logger::get("Ranges"), "Left: {}; Right {}, LI: {}, RI: {}, Step: {}", range.range.left.get(), range.range.right.get(), range.range.left_included, range.range.right_included, range.step); + // std::cout << + } + /// intersection with overflowed_table_range goes back. + if (overflowed_table_range.has_value()) + { + for (auto & r : ranges) + { + auto intersected_range = overflowed_table_range->intersectWith(r); + if (intersected_range) { + auto range_with_step = stepped_range_from_range(intersected_range.value(), numbers_storage.step, static_cast((static_cast(numbers_storage.offset) + std::numeric_limits::max() + 1) % numbers_storage.step)); + if (range_with_step) { + intersected_ranges.push_back(*range_with_step); + } + } + } + } - // /// limit total_size by query_limit - // if (should_pushdown_limit && query_limit < total_size) - // { - // total_size = query_limit; - // /// We should shrink intersected_ranges for case: - // /// intersected_ranges: [1, 4], [7, 100]; query_limit: 2 - // shrinkRanges(intersected_ranges, total_size); - // } + /// ranges is blank, return a source who has no data + if (intersected_ranges.empty()) + { + pipe.addSource(std::make_shared(NumbersSource::createHeader(numbers_storage.column_name))); + return pipe; + } + const auto & limit_length = limit_length_and_offset.first; + const auto & limit_offset = limit_length_and_offset.second; - // checkLimits(size_t(total_size)); + /// If intersected ranges is limited or we can pushdown limit. 
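                /// Illustrative arithmetic for the pushdown above (values assumed, not taken
                /// from this patch): with intersected_ranges = [1, 4], [7, 100] the ranges hold
                /// 4 + 94 = 98 rows, so a query LIMIT 2 caps total_size at 2 and shrinkRanges
                /// trims the list down to just [1, 2].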
+ if (!intersected_ranges.rbegin()->range.right.isPositiveInfinity() || should_pushdown_limit) + { + UInt128 total_size = sizeOfRanges(intersected_ranges); + UInt128 query_limit = limit_length + limit_offset; - // if (total_size / max_block_size < num_streams) - // num_streams = static_cast(total_size / max_block_size); + /// limit total_size by query_limit + if (should_pushdown_limit && query_limit < total_size) + { + total_size = query_limit; + /// We should shrink intersected_ranges for case: + /// intersected_ranges: [1, 4], [7, 100]; query_limit: 2 + shrinkRanges(intersected_ranges, total_size); + } - // if (num_streams == 0) - // num_streams = 1; + checkLimits(size_t(total_size)); - // /// Ranges state, all streams will share the state. - // auto ranges_state = std::make_shared(); - // for (size_t i = 0; i < num_streams; ++i) - // { - // auto source = std::make_shared(intersected_ranges, ranges_state, max_block_size, numbers_storage.column_name); + if (total_size / max_block_size < num_streams) + num_streams = static_cast(total_size / max_block_size); - // if (i == 0) - // source->addTotalRowsApprox(total_size); + if (num_streams == 0) + num_streams = 1; - // pipe.addSource(std::move(source)); - // } - // return pipe; - // } - // } + /// Ranges state, all streams will share the state. + auto ranges_state = std::make_shared(); + for (size_t i = 0; i < num_streams; ++i) + { + auto source = std::make_shared(intersected_ranges, ranges_state, max_block_size, numbers_storage.column_name); + + if (i == 0) + source->addTotalRowsApprox(total_size); + + pipe.addSource(std::move(source)); + } + return pipe; + } + } /// Fall back to NumbersSource for (size_t i = 0; i < num_streams; ++i) From 623b42574587073845a76a5e28a502a792ee6662 Mon Sep 17 00:00:00 2001 From: divanik Date: Tue, 6 Feb 2024 21:34:09 +0000 Subject: [PATCH 0012/1081] Add feature with the right author name --- .../QueryPlan/ReadFromSystemNumbersStep.cpp | 175 +++++++++++++----- src/Storages/SelectQueryInfo.h | 2 +- src/Storages/System/StorageSystemNumbers.cpp | 8 +- src/Storages/System/StorageSystemNumbers.h | 15 +- src/Storages/System/attachSystemTables.cpp | 2 +- src/TableFunctions/CMakeLists.txt | 2 +- src/TableFunctions/ITableFunction.cpp | 2 +- ...es.cpp => TableFunctionGenerateSeries.cpp} | 55 ++++-- src/TableFunctions/TableFunctionNumbers.cpp | 45 +++-- .../02970_generate_series.reference | 28 +++ .../0_stateless/02970_generate_series.sql | 14 ++ 11 files changed, 254 insertions(+), 94 deletions(-) rename src/TableFunctions/{TableFunctionsGenerateSeries.cpp => TableFunctionGenerateSeries.cpp} (65%) create mode 100644 tests/queries/0_stateless/02970_generate_series.reference create mode 100644 tests/queries/0_stateless/02970_generate_series.sql diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index f85473e43c3..4b957778c43 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -28,32 +28,37 @@ namespace class NumbersSource : public ISource { public: - NumbersSource(UInt64 block_size_, UInt64 offset_, UInt64 step_, const std::string& column_name, UInt64 inner_step_) - : ISource(createHeader(column_name)), block_size(block_size_), next(offset_), step(step_), inner_step(inner_step_), inner_remainder(offset_ % inner_step_) + NumbersSource(UInt64 block_size_, UInt64 offset_, UInt64 step_, const std::string & column_name, UInt64 inner_step_) + : ISource(createHeader(column_name)) 
+ , block_size(block_size_)
+ , next(offset_)
+ , step(step_)
+ , inner_step(inner_step_)
+ , inner_remainder(offset_ % inner_step_)
 {
 }
 String getName() const override { return "Numbers"; }
-
- static Block createHeader(const std::string& column_name) { return {ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), column_name)}; }
+ static Block createHeader(const std::string & column_name)
+ {
+ return {ColumnWithTypeAndName(ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), column_name)};
+ }

 protected:
 Chunk generate() override
 {
-
 UInt64 curr = next; /// The local variable for some reason works faster (>20%) than member of class.
 UInt64 first_element = (curr / inner_step) * inner_step + inner_remainder;
- if (first_element < curr) {
+ if (first_element < curr)
 first_element += inner_step;
- }
 UInt64 filtered_block_size = 0;
- if (first_element - curr >= block_size) {
+ if (first_element - curr >= block_size)
+ {
 auto column = ColumnUInt64::create(0);
 return {Columns{std::move(column)}, filtered_block_size};
 }
- if (first_element - curr < block_size) {
+ if (first_element - curr < block_size)
 filtered_block_size = (block_size - (first_element - curr) - 1) / inner_step + 1;
- }

 auto column = ColumnUInt64::create(filtered_block_size);
 ColumnUInt64::Container & vec = column->getData();
@@ -76,32 +81,37 @@ private:
 UInt64 inner_remainder;
 };

-struct RangeWithStep {
+struct RangeWithStep
+{
 Range range;
 UInt64 step;
 };

 using RangesWithStep = std::vector<RangeWithStep>;

-std::optional<RangeWithStep> stepped_range_from_range(const Range& r, UInt64 step, UInt64 remainder) {
- UInt64 begin = (r.left.get<UInt64>() / step) * step;
- if (begin > std::numeric_limits<UInt64>::max() - remainder) {
+std::optional<RangeWithStep> stepped_range_from_range(const Range & r, UInt64 step, UInt64 remainder)
+{
+ if ((r.right.get<UInt64>() == 0) && (!r.right_included))
+ return std::nullopt;
+ UInt64 begin = (r.left.get<UInt64>() / step) * step;
+ if (begin > std::numeric_limits<UInt64>::max() - remainder)
 return std::nullopt;
- }
 begin += remainder;

+ // LOG_DEBUG(&Poco::Logger::get("stepped_range_from_range"), "Begin: {}", begin);
+ // LOG_DEBUG(&Poco::Logger::get("stepped_range_from_range"), "Begin: {}", begin);
- while (begin <= r.left.get<UInt64>() - r.left_included) {
- if (std::numeric_limits<UInt64>::max() - step < begin) {
+ while ((r.left_included <= r.left.get<UInt64>()) && (begin <= r.left.get<UInt64>() - r.left_included))
+ {
+ if (std::numeric_limits<UInt64>::max() - step < begin)
 return std::nullopt;
- }
 begin += step;
 }

- LOG_DEBUG(&Poco::Logger::get("stepped_range_from_range"), "Begin: {}", begin);
- UInt128 right_edge = (r.right.get<UInt64>() + r.right_included);
- if (begin >= right_edge) {
+ // LOG_DEBUG(&Poco::Logger::get("stepped_range_from_range"), "Begin: {}", begin);
+ if ((begin >= r.right_included) && (begin - r.right_included >= r.right.get<UInt64>()))
 return std::nullopt;
- }
- return std::optional{RangeWithStep{Range(begin, true, static_cast<UInt64>(right_edge - 1), true), step}};
+ UInt64 right_edge_included = r.right.get<UInt64>() - (1 - r.right_included);
+ return std::optional{RangeWithStep{Range(begin, true, right_edge_included, true), step}};
 }

 [[maybe_unused]] UInt128 sizeOfRange(const RangeWithStep & r)
@@ -144,8 +154,17 @@ public:

 using RangesStatePtr = std::shared_ptr<RangesState>;

- [[maybe_unused]] NumbersRangedSource(const RangesWithStep & ranges_, RangesStatePtr & ranges_state_, UInt64 base_block_size_, const std::string& column_name)
- : ISource(NumbersSource::createHeader(column_name)), ranges(ranges_), ranges_state(ranges_state_), base_block_size(base_block_size_)
+ [[maybe_unused]] NumbersRangedSource(
+ const
RangesWithStep & ranges_, + RangesStatePtr & ranges_state_, + UInt64 base_block_size_, + UInt64 step_, + const std::string & column_name) + : ISource(NumbersSource::createHeader(column_name)) + , ranges(ranges_) + , ranges_state(ranges_state_) + , base_block_size(base_block_size_) + , step(step_) { } @@ -158,6 +177,7 @@ protected: { std::lock_guard lock(ranges_state->mutex); + UInt64 need = base_block_size_; UInt64 size = 0; /// how many item found. @@ -196,6 +216,10 @@ protected: } ranges_state->pos = end; + + LOG_DEBUG(&Poco::Logger::get("Range borders"), "Begin: {} {}", start.offset_in_ranges, static_cast(start.offset_in_range)); + LOG_DEBUG(&Poco::Logger::get("Range borders"), "End: {} {}", end.offset_in_ranges, static_cast(end.offset_in_range)); + return size; } @@ -234,12 +258,19 @@ protected: ? end.offset_in_range - cursor.offset_in_range : static_cast(last_value(range) - first_value(range)) / range.step + 1 - cursor.offset_in_range; + LOG_DEBUG( + &Poco::Logger::get("Generate"), + "Can Provide: {}, Block size: {}", + static_cast(can_provide), + static_cast(block_size)); + /// set value to block auto set_value = [&pos, this](UInt128 & start_value, UInt128 & end_value) { if (end_value > std::numeric_limits::max()) { - while (start_value < end_value) { + while (start_value < end_value) + { *(pos++) = start_value; start_value += this->step; } @@ -248,7 +279,9 @@ protected: { auto start_value_64 = static_cast(start_value); auto end_value_64 = static_cast(end_value); - auto size = end_value_64 - start_value_64; + auto size = (end_value_64 - start_value_64) / this->step; + LOG_DEBUG( + &Poco::Logger::get("Iota"), "Size: {}, Step: {}, Start: {}", static_cast(size), this->step, start_value_64); iota_with_step(pos, static_cast(size), start_value_64, step); pos += size; } @@ -374,7 +407,7 @@ ReadFromSystemNumbersStep::ReadFromSystemNumbersStep( , key_expression{KeyDescription::parse(column_names[0], storage_snapshot->metadata->columns, context).expression} , max_block_size{max_block_size_} , num_streams{num_streams_} - , limit_length_and_offset(InterpreterSelectQuery::getLimitLengthAndOffset(query_info.query->as(), context)) + , limit_length_and_offset(InterpreterSelectQuery::getLimitLengthAndOffset(query_info.query->as(), context)) , should_pushdown_limit(shouldPushdownLimit(query_info, limit_length_and_offset.first)) , limit(query_info.limit) , storage_limits(query_info.storage_limits) @@ -410,14 +443,28 @@ Pipe ReadFromSystemNumbersStep::makePipe() { auto & numbers_storage = storage->as(); + LOG_DEBUG( + &Poco::Logger::get("Parameters"), + "Parameters: Limit: {}, Offset: {} Step: {}", + numbers_storage.limit.value(), + numbers_storage.offset, + numbers_storage.step); + if (!numbers_storage.multithreaded) num_streams = 1; + Pipe pipe; + Ranges ranges; + + if (numbers_storage.limit.has_value() && (numbers_storage.limit.value() == 0)) + { + pipe.addSource(std::make_shared(NumbersSource::createHeader(numbers_storage.column_name))); + return pipe; + } + /// Build rpn of query filters KeyCondition condition(buildFilterDAG(), context, column_names, key_expression); - Pipe pipe; - Ranges ranges; if (condition.extractPlainRanges(ranges)) { @@ -430,14 +477,15 @@ Pipe ReadFromSystemNumbersStep::makePipe() { if (std::numeric_limits::max() - numbers_storage.offset >= *(numbers_storage.limit)) { - table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(numbers_storage.offset + *(numbers_storage.limit)), false); + table_range.emplace( + FieldRef(numbers_storage.offset), true, 
FieldRef(numbers_storage.offset + *(numbers_storage.limit)), false); } /// UInt64 overflow, for example: SELECT number FROM numbers(18446744073709551614, 5) else { table_range.emplace(FieldRef(numbers_storage.offset), true, std::numeric_limits::max(), true); auto overflow_end = UInt128(numbers_storage.offset) + UInt128(*numbers_storage.limit); - overflowed_table_range.emplace( + overflowed_table_range.emplace( FieldRef(UInt64(0)), true, FieldRef(UInt64(overflow_end - std::numeric_limits::max() - 1)), false); } } @@ -451,34 +499,59 @@ Pipe ReadFromSystemNumbersStep::makePipe() for (auto & r : ranges) { auto intersected_range = table_range->intersectWith(r); - if (intersected_range.has_value()) { - auto range_with_step = stepped_range_from_range(intersected_range.value(), numbers_storage.step, numbers_storage.offset % numbers_storage.step); - if (range_with_step.has_value()) { + if (intersected_range.has_value()) + { + LOG_DEBUG( + &Poco::Logger::get("Ranges"), + "Ranges: {} {} {} {}", + intersected_range->left.get(), + intersected_range->right.get(), + intersected_range->left_included, + intersected_range->right_included); + auto range_with_step = stepped_range_from_range( + intersected_range.value(), numbers_storage.step, numbers_storage.offset % numbers_storage.step); + if (range_with_step.has_value()) + { + LOG_DEBUG( + &Poco::Logger::get("Ranges With Step"), + "Ranges: {} {} {} {} {}", + range_with_step->range.left.get(), + range_with_step->range.right.get(), + range_with_step->range.left_included, + range_with_step->range.right_included, + range_with_step->step); intersected_ranges.push_back(*range_with_step); } } } - for (const auto& range : intersected_ranges) { - LOG_DEBUG(&Poco::Logger::get("Ranges"), "Left: {}; Right {}, LI: {}, RI: {}, Step: {}", range.range.left.get(), range.range.right.get(), range.range.left_included, range.range.right_included, range.step); - // std::cout << - } /// intersection with overflowed_table_range goes back. 
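            /// Worked example of the overflow split for the query mentioned above,
            /// SELECT number FROM numbers(18446744073709551614, 5): table_range covers
            /// [18446744073709551614, 18446744073709551615] (2 rows) and
            /// overflowed_table_range covers [0, 3) (3 rows), i.e. 5 rows in total.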
if (overflowed_table_range.has_value()) { for (auto & r : ranges) { auto intersected_range = overflowed_table_range->intersectWith(r); - if (intersected_range) { - auto range_with_step = stepped_range_from_range(intersected_range.value(), numbers_storage.step, static_cast((static_cast(numbers_storage.offset) + std::numeric_limits::max() + 1) % numbers_storage.step)); - if (range_with_step) { + if (intersected_range) + { + auto range_with_step = stepped_range_from_range( + intersected_range.value(), + numbers_storage.step, + static_cast( + (static_cast(numbers_storage.offset) + std::numeric_limits::max() + 1) + % numbers_storage.step)); + if (range_with_step) intersected_ranges.push_back(*range_with_step); - } } } } + // for (const auto& range : intersected_ranges) + // { + // LOG_DEBUG(&Poco::Logger::get("Ranges with step"), "Left: {}; Right {}, LI: {}, RI: {}, Step: {}", range.range.left.get(), range.range.right.get(), range.range.left_included, range.range.right_included, range.step); + // // std::cout << + // } + /// ranges is blank, return a source who has no data if (intersected_ranges.empty()) { @@ -492,6 +565,7 @@ Pipe ReadFromSystemNumbersStep::makePipe() if (!intersected_ranges.rbegin()->range.right.isPositiveInfinity() || should_pushdown_limit) { UInt128 total_size = sizeOfRanges(intersected_ranges); + LOG_DEBUG(&Poco::Logger::get("Total_Size"), "Total Size: {}", static_cast(total_size)); UInt128 query_limit = limit_length + limit_offset; /// limit total_size by query_limit @@ -515,7 +589,8 @@ Pipe ReadFromSystemNumbersStep::makePipe() auto ranges_state = std::make_shared(); for (size_t i = 0; i < num_streams; ++i) { - auto source = std::make_shared(intersected_ranges, ranges_state, max_block_size, numbers_storage.column_name); + auto source = std::make_shared( + intersected_ranges, ranges_state, max_block_size, numbers_storage.step, numbers_storage.column_name); if (i == 0) source->addTotalRowsApprox(total_size); @@ -529,12 +604,16 @@ Pipe ReadFromSystemNumbersStep::makePipe() /// Fall back to NumbersSource for (size_t i = 0; i < num_streams; ++i) { - auto source - = std::make_shared(max_block_size, numbers_storage.offset + i * max_block_size, num_streams * max_block_size, numbers_storage.column_name, numbers_storage.step); + auto source = std::make_shared( + max_block_size, + numbers_storage.offset + i * max_block_size, + num_streams * max_block_size, + numbers_storage.column_name, + numbers_storage.step); if (numbers_storage.limit && i == 0) { - auto rows_appr = *(numbers_storage.limit); + auto rows_appr = (*numbers_storage.limit - 1) / numbers_storage.step + 1; if (limit > 0 && limit < rows_appr) rows_appr = limit; source->addTotalRowsApprox(rows_appr); @@ -546,7 +625,7 @@ Pipe ReadFromSystemNumbersStep::makePipe() if (numbers_storage.limit) { size_t i = 0; - auto storage_limit = *(numbers_storage.limit); + auto storage_limit = (*numbers_storage.limit - 1) / numbers_storage.step + 1; /// This formula is how to split 'limit' elements to 'num_streams' chunks almost uniformly. 
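        /// Worked example of that split, assuming the i-th stream receives
        /// limit * (i + 1) / num_streams - limit * i / num_streams rows (integer
        /// division, the exact expression lives in the transform below):
        /// limit = 10, num_streams = 3 gives 3, 3 and 4 rows, which sums back
        /// to exactly 10 with no gaps or overlaps.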
pipe.addSimpleTransform( [&](const Block & header) diff --git a/src/Storages/SelectQueryInfo.h b/src/Storages/SelectQueryInfo.h index 2b4afaa6345..662a5c0ef5a 100644 --- a/src/Storages/SelectQueryInfo.h +++ b/src/Storages/SelectQueryInfo.h @@ -229,4 +229,4 @@ struct SelectQueryInfo bool isFinal() const; }; - } +} diff --git a/src/Storages/System/StorageSystemNumbers.cpp b/src/Storages/System/StorageSystemNumbers.cpp index cd7207917a9..4c319ec7105 100644 --- a/src/Storages/System/StorageSystemNumbers.cpp +++ b/src/Storages/System/StorageSystemNumbers.cpp @@ -16,7 +16,13 @@ namespace DB { -StorageSystemNumbers::StorageSystemNumbers(const StorageID & table_id, bool multithreaded_, const std::string& column_name_, std::optional limit_, UInt64 offset_, UInt64 step_) +StorageSystemNumbers::StorageSystemNumbers( + const StorageID & table_id, + bool multithreaded_, + const std::string & column_name_, + std::optional limit_, + UInt64 offset_, + UInt64 step_) : IStorage(table_id), multithreaded(multithreaded_), limit(limit_), offset(offset_), column_name(column_name_), step(step_) { StorageInMemoryMetadata storage_metadata; diff --git a/src/Storages/System/StorageSystemNumbers.h b/src/Storages/System/StorageSystemNumbers.h index ffe87b8ad14..9663ee25251 100644 --- a/src/Storages/System/StorageSystemNumbers.h +++ b/src/Storages/System/StorageSystemNumbers.h @@ -10,11 +10,17 @@ namespace DB class Context; -class StorageSystemNumbers final : public IStorage +class StorageSystemNumbers final : public IStorage { public: /// Otherwise, streams concurrently increment atomic. - StorageSystemNumbers(const StorageID & table_id, bool multithreaded_, const std::string& column_name, std::optional limit_ = std::nullopt, UInt64 offset_ = 0, UInt64 step_ = 1); + StorageSystemNumbers( + const StorageID & table_id, + bool multithreaded_, + const std::string & column_name, + std::optional limit_ = std::nullopt, + UInt64 offset_ = 0, + UInt64 step_ = 1); std::string getName() const override { return "SystemNumbers"; } @@ -30,7 +36,6 @@ public: bool hasEvenlyDistributedRead() const override { return true; } bool isSystemStorage() const override { return true; } - bool supportsTransactions() const override { return true; } private: @@ -38,11 +43,9 @@ private: bool multithreaded; std::optional limit; - UInt64 offset; + UInt64 offset;` std::string column_name; - UInt64 step; - }; } diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index ddd89709b6a..9eacb07bd8d 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -120,7 +120,7 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b attach(context, system_database, "one", "This table contains a single row with a single dummy UInt8 column containing the value 0. Used when the table is not specified explicitly, for example in queries like `SELECT 1`."); attach(context, system_database, "numbers", "Generates all natural numbers, starting from 0 (to 2^64 - 1, and then again) in sorted order.", false, "number"); attach(context, system_database, "numbers_mt", "Multithreaded version of `system.numbers`. Numbers order is not guaranteed.", true, "number"); - // attach(context, system_database, "generate_series", "Multithreaded version of `system.numbers`. 
Numbers order is not guaranteed.", false, "generate_series"); + attach(context, system_database, "generate_series", "Generates arithmetic progression of natural numbers in sorted order in a given segment with a given step", false, "generate_series"); attach(context, system_database, "zeros", "Produces unlimited number of non-materialized zeros.", false); attach(context, system_database, "zeros_mt", "Multithreaded version of system.zeros.", true); attach(context, system_database, "databases", "Lists all databases of the current server."); diff --git a/src/TableFunctions/CMakeLists.txt b/src/TableFunctions/CMakeLists.txt index c5c2a660935..770990cc405 100644 --- a/src/TableFunctions/CMakeLists.txt +++ b/src/TableFunctions/CMakeLists.txt @@ -29,7 +29,7 @@ if (TARGET ch_contrib::azure_sdk) target_link_libraries(clickhouse_table_functions PRIVATE ch_contrib::azure_sdk) endif () -if (TARGET ch_co`trib::simdjson) +if (TARGET ch_contrib::simdjson) target_link_libraries(clickhouse_table_functions PRIVATE ch_contrib::simdjson) endif () diff --git a/src/TableFunctions/ITableFunction.cpp b/src/TableFunctions/ITableFunction.cpp index c854b6b0f9c..137e1dc27fe 100644 --- a/src/TableFunctions/ITableFunction.cpp +++ b/src/TableFunctions/ITableFunction.cpp @@ -5,7 +5,7 @@ #include #include #include -#include + namespace ProfileEvents { diff --git a/src/TableFunctions/TableFunctionsGenerateSeries.cpp b/src/TableFunctions/TableFunctionGenerateSeries.cpp similarity index 65% rename from src/TableFunctions/TableFunctionsGenerateSeries.cpp rename to src/TableFunctions/TableFunctionGenerateSeries.cpp index 3941f1eadb2..88d7b0d1a71 100644 --- a/src/TableFunctions/TableFunctionsGenerateSeries.cpp +++ b/src/TableFunctions/TableFunctionGenerateSeries.cpp @@ -1,13 +1,13 @@ +#include +#include +#include +#include +#include +#include #include #include -#include -#include #include -#include -#include -#include -#include -#include +#include #include "registerTableFunctions.h" @@ -18,6 +18,7 @@ namespace ErrorCodes { extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int INVALID_SETTING_VALUE; } namespace @@ -33,8 +34,14 @@ public: static constexpr auto name = "generate_series"; std::string getName() const override { return name; } bool hasStaticStructure() const override { return true; } + private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; + StoragePtr executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + bool is_insert_query) const override; const char * getStorageTypeName() const override { return "SystemNumbers"; } UInt64 evaluateArgument(ContextPtr context, ASTPtr & argument) const; @@ -48,25 +55,31 @@ ColumnsDescription TableFunctionGenerateSeries::getActualTableStructure(ContextP return ColumnsDescription{{{"generate_series", std::make_shared()}}}; } -StoragePtr TableFunctionGenerateSeries::executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const +StoragePtr TableFunctionGenerateSeries::executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription /*cached_columns*/, + bool /*is_insert_query*/) const { if (const auto * function = ast_function->as()) { auto arguments = 
function->arguments->children; if (arguments.size() != 2 && arguments.size() != 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires 'length' or 'offset, length'.", getName()); + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires 'length' or 'offset, length'.", getName()); UInt64 start = evaluateArgument(context, arguments[0]); UInt64 stop = evaluateArgument(context, arguments[1]); - UInt64 interval = (arguments.size() == 3) ? evaluateArgument(context, arguments[2]) : UInt64{1}; - if (start > stop) { - auto res = std::make_shared(StorageID(getDatabaseName(), table_name), false, std::string{"generate_series"}, 0); - res->startup(); - return res; - } - - auto res = std::make_shared(StorageID(getDatabaseName(), table_name), false, std::string{"generate_series"}, (stop - start) / interval + 1, start, interval); + UInt64 step = (arguments.size() == 3) ? evaluateArgument(context, arguments[2]) : UInt64{1}; + if (step == UInt64{0}) + throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Table function '{}' requires step to be a positive number", getName()); + auto res = (start > stop) + ? std::make_shared( + StorageID(getDatabaseName(), table_name), false, std::string{"generate_series"}, 0, 0, 0) + : std::make_shared( + StorageID(getDatabaseName(), table_name), false, std::string{"generate_series"}, (stop - start) + 1, start, step); res->startup(); return res; } @@ -82,8 +95,10 @@ UInt64 TableFunctionGenerateSeries::evaluateArgument(ContextPtr context, ASTPtr Field converted = convertFieldToType(field, DataTypeUInt64()); if (converted.isNull()) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The value {} is not representable as UInt64", - applyVisitor(FieldVisitorToString(), field)); + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "The value {} is not representable as UInt64", + applyVisitor(FieldVisitorToString(), field)); return converted.safeGet(); } diff --git a/src/TableFunctions/TableFunctionNumbers.cpp b/src/TableFunctions/TableFunctionNumbers.cpp index 71a9ba097c6..bcda8dc6a5e 100644 --- a/src/TableFunctions/TableFunctionNumbers.cpp +++ b/src/TableFunctions/TableFunctionNumbers.cpp @@ -1,13 +1,13 @@ +#include +#include +#include +#include +#include +#include #include #include -#include -#include #include -#include -#include -#include -#include -#include +#include #include "registerTableFunctions.h" @@ -16,8 +16,8 @@ namespace DB namespace ErrorCodes { - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; +extern const int ILLEGAL_TYPE_OF_ARGUMENT; } namespace @@ -34,8 +34,14 @@ public: static constexpr auto name = multithreaded ? 
"numbers_mt" : "numbers"; std::string getName() const override { return name; } bool hasStaticStructure() const override { return true; } + private: - StoragePtr executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns, bool is_insert_query) const override; + StoragePtr executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription cached_columns, + bool is_insert_query) const override; const char * getStorageTypeName() const override { return "SystemNumbers"; } UInt64 evaluateArgument(ContextPtr context, ASTPtr & argument) const; @@ -51,19 +57,26 @@ ColumnsDescription TableFunctionNumbers::getActualTableStructure( } template -StoragePtr TableFunctionNumbers::executeImpl(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const +StoragePtr TableFunctionNumbers::executeImpl( + const ASTPtr & ast_function, + ContextPtr context, + const std::string & table_name, + ColumnsDescription /*cached_columns*/, + bool /*is_insert_query*/) const { if (const auto * function = ast_function->as()) { auto arguments = function->arguments->children; if (arguments.size() != 1 && arguments.size() != 2) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires 'length' or 'offset, length'.", getName()); + throw Exception( + ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires 'length' or 'offset, length'.", getName()); UInt64 offset = arguments.size() == 2 ? evaluateArgument(context, arguments[0]) : 0; UInt64 length = arguments.size() == 2 ? evaluateArgument(context, arguments[1]) : evaluateArgument(context, arguments[0]); - auto res = std::make_shared(StorageID(getDatabaseName(), table_name), multithreaded, std::string{"number"}, length, offset); + auto res = std::make_shared( + StorageID(getDatabaseName(), table_name), multithreaded, std::string{"number"}, length, offset); res->startup(); return res; } @@ -80,8 +93,10 @@ UInt64 TableFunctionNumbers::evaluateArgument(ContextPtr context, Field converted = convertFieldToType(field, DataTypeUInt64()); if (converted.isNull()) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "The value {} is not representable as UInt64", - applyVisitor(FieldVisitorToString(), field)); + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "The value {} is not representable as UInt64", + applyVisitor(FieldVisitorToString(), field)); return converted.safeGet(); } diff --git a/tests/queries/0_stateless/02970_generate_series.reference b/tests/queries/0_stateless/02970_generate_series.reference new file mode 100644 index 00000000000..9e6f1db911e --- /dev/null +++ b/tests/queries/0_stateless/02970_generate_series.reference @@ -0,0 +1,28 @@ +0 +1 +4 +8 +500000001 +50000000 +100000001 +0 +10 +13 +16 +19 +7 +17 +27 +37 +47 +57 +67 +77 +17 +22 +27 +32 +37 +42 +47 +52 diff --git a/tests/queries/0_stateless/02970_generate_series.sql b/tests/queries/0_stateless/02970_generate_series.sql new file mode 100644 index 00000000000..045f584a622 --- /dev/null +++ b/tests/queries/0_stateless/02970_generate_series.sql @@ -0,0 +1,14 @@ +SELECT count() FROM generate_series(5, 4); +SELECT count() FROM generate_series(0, 0); +SELECT count() FROM generate_series(10, 20, 3); +SELECT count() FROM generate_series(7, 77, 10); +SELECT count() FROM generate_series(0, 1000000000, 2); +SELECT count() FROM generate_series(0, 
999999999, 20); +SELECT count() FROM generate_series(0, 1000000000, 2) WHERE generate_series % 5 == 0; + +SELECT * FROM generate_series(5, 4); +SELECT * FROM generate_series(0, 0); +SELECT * FROM generate_series(10, 20, 3); +SELECT * FROM generate_series(7, 77, 10); +SELECT * FROM generate_series(7, 52, 5) WHERE generate_series >= 13; + From 145e425ddd5707a5852dd3c6ac2672ccbd68e2bd Mon Sep 17 00:00:00 2001 From: divanik Date: Wed, 7 Feb 2024 15:29:45 +0000 Subject: [PATCH 0013/1081] Added Documentation --- .../table-functions/generate_series.md | 25 +++++++++ .../QueryPlan/ReadFromSystemNumbersStep.cpp | 53 ------------------- src/Storages/System/StorageSystemNumbers.h | 2 +- 3 files changed, 26 insertions(+), 54 deletions(-) create mode 100644 docs/en/sql-reference/table-functions/generate_series.md diff --git a/docs/en/sql-reference/table-functions/generate_series.md b/docs/en/sql-reference/table-functions/generate_series.md new file mode 100644 index 00000000000..de34e10ac76 --- /dev/null +++ b/docs/en/sql-reference/table-functions/generate_series.md @@ -0,0 +1,25 @@ +--- +slug: /en/sql-reference/table-functions/generate_series +sidebar_position: ? +sidebar_label: generate_series +--- + +# generate_series + +`generate_series(START, STOP)` - Returns a table with the single ‘generate_series’ column (UInt64) that contains integers from start to stop inclusively. + +`generate_series(START, STOP, STEP)` - Returns a table with the single ‘generate_series’ column (UInt64) that contains integers from start to stop inclusively with spacing between values given by STEP. + +The following queries return tables with the same content but different column names: + +``` sql +SELECT * FROM numbers(10, 5); +SELECT * FROM generate_series(10, 14); +``` + +And the following queries return tables with the same content but different column names (but the second option is more efficient): + +``` sql +SELECT * FROM numbers(10, 11) WHERE number % 3 == (10 % 3); +SELECT * FROM generate_series(10, 20, 3) ; +``` \ No newline at end of file diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index 4b957778c43..3bb2e0cd69d 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -98,8 +98,6 @@ std::optional stepped_range_from_range(const Range & r, UInt64 st return std::nullopt; begin += remainder; - // LOG_DEBUG(&Poco::Logger::get("stepped_range_from_range"), "Begin: {}", begin); - // LOG_DEBUG(&Poco::Logger::get("stepped_range_from_range"), "Begin: {}", begin); while ((r.left_included <= r.left.get()) && (begin <= r.left.get() - r.left_included)) { if (std::numeric_limits::max() - step < begin) @@ -107,7 +105,6 @@ std::optional stepped_range_from_range(const Range & r, UInt64 st begin += step; } - // LOG_DEBUG(&Poco::Logger::get("stepped_range_from_range"), "Begin: {}", begin); if ((begin >= r.right_included) && (begin - r.right_included >= r.right.get())) return std::nullopt; UInt64 right_edge_included = r.right.get() - (1 - r.right_included); @@ -217,9 +214,6 @@ protected: ranges_state->pos = end; - LOG_DEBUG(&Poco::Logger::get("Range borders"), "Begin: {} {}", start.offset_in_ranges, static_cast(start.offset_in_range)); - LOG_DEBUG(&Poco::Logger::get("Range borders"), "End: {} {}", end.offset_in_ranges, static_cast(end.offset_in_range)); - return size; } @@ -258,12 +252,6 @@ protected: ? 
end.offset_in_range - cursor.offset_in_range : static_cast(last_value(range) - first_value(range)) / range.step + 1 - cursor.offset_in_range; - LOG_DEBUG( - &Poco::Logger::get("Generate"), - "Can Provide: {}, Block size: {}", - static_cast(can_provide), - static_cast(block_size)); - /// set value to block auto set_value = [&pos, this](UInt128 & start_value, UInt128 & end_value) { @@ -280,8 +268,6 @@ protected: auto start_value_64 = static_cast(start_value); auto end_value_64 = static_cast(end_value); auto size = (end_value_64 - start_value_64) / this->step; - LOG_DEBUG( - &Poco::Logger::get("Iota"), "Size: {}, Step: {}, Start: {}", static_cast(size), this->step, start_value_64); iota_with_step(pos, static_cast(size), start_value_64, step); pos += size; } @@ -443,13 +429,6 @@ Pipe ReadFromSystemNumbersStep::makePipe() { auto & numbers_storage = storage->as(); - LOG_DEBUG( - &Poco::Logger::get("Parameters"), - "Parameters: Limit: {}, Offset: {} Step: {}", - numbers_storage.limit.value(), - numbers_storage.offset, - numbers_storage.step); - if (!numbers_storage.multithreaded) num_streams = 1; @@ -468,7 +447,6 @@ Pipe ReadFromSystemNumbersStep::makePipe() if (condition.extractPlainRanges(ranges)) { - LOG_DEBUG(&Poco::Logger::get("My logger"), "Use optimization"); /// Intersect ranges with table range std::optional table_range; std::optional overflowed_table_range; @@ -493,36 +471,11 @@ Pipe ReadFromSystemNumbersStep::makePipe() { table_range.emplace(FieldRef(numbers_storage.offset), true, FieldRef(std::numeric_limits::max()), true); } - LOG_DEBUG(&Poco::Logger::get("My logger"), "Found table ranges"); RangesWithStep intersected_ranges; for (auto & r : ranges) { auto intersected_range = table_range->intersectWith(r); - if (intersected_range.has_value()) - { - LOG_DEBUG( - &Poco::Logger::get("Ranges"), - "Ranges: {} {} {} {}", - intersected_range->left.get(), - intersected_range->right.get(), - intersected_range->left_included, - intersected_range->right_included); - auto range_with_step = stepped_range_from_range( - intersected_range.value(), numbers_storage.step, numbers_storage.offset % numbers_storage.step); - if (range_with_step.has_value()) - { - LOG_DEBUG( - &Poco::Logger::get("Ranges With Step"), - "Ranges: {} {} {} {} {}", - range_with_step->range.left.get(), - range_with_step->range.right.get(), - range_with_step->range.left_included, - range_with_step->range.right_included, - range_with_step->step); - intersected_ranges.push_back(*range_with_step); - } - } } @@ -546,11 +499,6 @@ Pipe ReadFromSystemNumbersStep::makePipe() } } - // for (const auto& range : intersected_ranges) - // { - // LOG_DEBUG(&Poco::Logger::get("Ranges with step"), "Left: {}; Right {}, LI: {}, RI: {}, Step: {}", range.range.left.get(), range.range.right.get(), range.range.left_included, range.range.right_included, range.step); - // // std::cout << - // } /// ranges is blank, return a source who has no data if (intersected_ranges.empty()) @@ -565,7 +513,6 @@ Pipe ReadFromSystemNumbersStep::makePipe() if (!intersected_ranges.rbegin()->range.right.isPositiveInfinity() || should_pushdown_limit) { UInt128 total_size = sizeOfRanges(intersected_ranges); - LOG_DEBUG(&Poco::Logger::get("Total_Size"), "Total Size: {}", static_cast(total_size)); UInt128 query_limit = limit_length + limit_offset; /// limit total_size by query_limit diff --git a/src/Storages/System/StorageSystemNumbers.h b/src/Storages/System/StorageSystemNumbers.h index 9663ee25251..298721984b8 100644 --- a/src/Storages/System/StorageSystemNumbers.h +++ 
b/src/Storages/System/StorageSystemNumbers.h
@@ -43,7 +43,7 @@ private:
 bool multithreaded;
 std::optional<UInt64> limit;
- UInt64 offset;`
+ UInt64 offset;
 std::string column_name;
 UInt64 step;
 };

From 03aaedace439f5db6d9a6aaf91a1b2f978b0f6a9 Mon Sep 17 00:00:00 2001
From: divanik <divan.a.p@mail.ru>
Date: Fri, 9 Feb 2024 12:05:01 +0000
Subject: [PATCH 0014/1081] Fix bug

---
 .../QueryPlan/ReadFromSystemNumbersStep.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp
index 3bb2e0cd69d..bc14547889b 100644
--- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp
+++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp
@@ -435,6 +435,8 @@ Pipe ReadFromSystemNumbersStep::makePipe()
 Pipe pipe;
 Ranges ranges;

+ // LOG_DEBUG(&Poco::Logger::get("parameters"), "Parameters: {} {} {}", numbers_storage.step, numbers_storage.limit.value(), numbers_storage.offset);
+
 if (numbers_storage.limit.has_value() && (numbers_storage.limit.value() == 0))
 {
 pipe.addSource(std::make_shared<NullSource>(NumbersSource::createHeader(numbers_storage.column_name)));
 return pipe;
@@ -476,6 +478,15 @@ Pipe ReadFromSystemNumbersStep::makePipe()
 for (auto & r : ranges)
 {
 auto intersected_range = table_range->intersectWith(r);
+ if (intersected_range.has_value())
+ {
+ auto range_with_step = stepped_range_from_range(
+ intersected_range.value(), numbers_storage.step, numbers_storage.offset % numbers_storage.step);
+ if (range_with_step.has_value())
+ {
+ intersected_ranges.push_back(*range_with_step);
+ }
+ }
 }

From 1b2f23247b7f115ba92b9908d224d4e78e8649f4 Mon Sep 17 00:00:00 2001
From: divanik <divan.a.p@mail.ru>
Date: Fri, 9 Feb 2024 12:28:54 +0000
Subject: [PATCH 0015/1081] Change documentation

---
 docs/en/sql-reference/table-functions/generate_series.md | 2 +-
 src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/en/sql-reference/table-functions/generate_series.md b/docs/en/sql-reference/table-functions/generate_series.md
index de34e10ac76..c5d29369627 100644
--- a/docs/en/sql-reference/table-functions/generate_series.md
+++ b/docs/en/sql-reference/table-functions/generate_series.md
@@ -1,6 +1,6 @@
 ---
 slug: /en/sql-reference/table-functions/generate_series
-sidebar_position: ?
+sidebar_position: 146 sidebar_label: generate_series --- diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index bc14547889b..ab2f726aeb5 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -483,9 +483,7 @@ Pipe ReadFromSystemNumbersStep::makePipe() auto range_with_step = stepped_range_from_range( intersected_range.value(), numbers_storage.step, numbers_storage.offset % numbers_storage.step); if (range_with_step.has_value()) - { intersected_ranges.push_back(*range_with_step); - } } } From 79f91003538a71014eb035dca024285f2fbba7d5 Mon Sep 17 00:00:00 2001 From: divanik Date: Fri, 9 Feb 2024 14:17:25 +0000 Subject: [PATCH 0016/1081] To pull --- src/TableFunctions/TableFunctionGenerateSeries.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/TableFunctions/TableFunctionGenerateSeries.cpp b/src/TableFunctions/TableFunctionGenerateSeries.cpp index 88d7b0d1a71..65c4c4915c2 100644 --- a/src/TableFunctions/TableFunctionGenerateSeries.cpp +++ b/src/TableFunctions/TableFunctionGenerateSeries.cpp @@ -77,7 +77,7 @@ StoragePtr TableFunctionGenerateSeries::executeImpl( throw Exception(ErrorCodes::INVALID_SETTING_VALUE, "Table function '{}' requires step to be a positive number", getName()); auto res = (start > stop) ? std::make_shared( - StorageID(getDatabaseName(), table_name), false, std::string{"generate_series"}, 0, 0, 0) + StorageID(getDatabaseName(), table_name), false, std::string{"generate_series"}, 0, 0, 1) : std::make_shared( StorageID(getDatabaseName(), table_name), false, std::string{"generate_series"}, (stop - start) + 1, start, step); res->startup(); From f7dbcdd7e7e00d4fb6d30a02ebcb4a3befcd3190 Mon Sep 17 00:00:00 2001 From: divanik Date: Fri, 9 Feb 2024 18:12:24 +0000 Subject: [PATCH 0017/1081] Made refactoring --- .../QueryPlan/ReadFromSystemNumbersStep.cpp | 41 ++++++++++++------- src/Storages/System/StorageSystemNumbers.h | 28 +++++++++++++ 2 files changed, 54 insertions(+), 15 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index 3a905a56aa1..2488fa37643 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -9,10 +9,12 @@ #include #include #include +#include #include #include #include +#include "base/types.h" namespace DB { @@ -28,13 +30,13 @@ namespace class NumbersSource : public ISource { public: - NumbersSource(UInt64 block_size_, UInt64 offset_, UInt64 step_, const std::string & column_name, UInt64 inner_step_) + NumbersSource(UInt64 block_size_, UInt64 offset_, UInt64 chunk_step_, const std::string & column_name, UInt64 step_, UInt64 remainder_) : ISource(createHeader(column_name)) , block_size(block_size_) , next(offset_) + , chunk_step(chunk_step_) , step(step_) - , inner_step(inner_step_) - , inner_remainder(offset_ % inner_step_) + , remainder(remainder_) { } String getName() const override { return "Numbers"; } @@ -48,25 +50,33 @@ protected: Chunk generate() override { UInt64 curr = next; /// The local variable for some reason works faster (>20%) than member of class. 
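/// Layout sketch for the refactored source (stream count and block size are
/// illustrative, not from this patch): with num_streams = 2 and max_block_size = 8,
/// stream 0 starts at `offset`, stream 1 at `offset + 8`, and each advances its
/// cursor by chunk_step = num_streams * max_block_size = 16, so the streams'
/// blocks interleave without overlap while `step`/`remainder` pick the series
/// members inside each block.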
- UInt64 first_element = (curr / inner_step) * inner_step + inner_remainder; - if (first_element < curr) - first_element += inner_step; - UInt64 filtered_block_size = 0; + UInt64 first_element = (curr / step) * step; + if (first_element > std::numeric_limits::max() - remainder) { + auto column = ColumnUInt64::create(0); + return {Columns{std::move(column)}, 0}; + } + first_element += remainder; + if (first_element < curr) { + if (first_element > std::numeric_limits::max() - step) { + auto column = ColumnUInt64::create(0); + return {Columns{std::move(column)}, 0}; + } + first_element += step; + } if (first_element - curr >= block_size) { auto column = ColumnUInt64::create(0); - return {Columns{std::move(column)}, filtered_block_size}; + return {Columns{std::move(column)}, 0}; } - if (first_element - curr < block_size) - filtered_block_size = (block_size - (first_element - curr) - 1) / inner_step + 1; + UInt64 filtered_block_size = (block_size - (first_element - curr) - 1) / step + 1; auto column = ColumnUInt64::create(filtered_block_size); ColumnUInt64::Container & vec = column->getData(); UInt64 * pos = vec.data(); /// This also accelerates the code. UInt64 * end = &vec[filtered_block_size]; - iota_with_step(pos, static_cast(end - pos), first_element, inner_step); + iota_with_step(pos, static_cast(end - pos), first_element, step); - next += step; + next += chunk_step; progress(column->size(), column->byteSize()); @@ -76,9 +86,9 @@ protected: private: UInt64 block_size; UInt64 next; + UInt64 chunk_step; UInt64 step; - UInt64 inner_step; - UInt64 inner_remainder; + UInt64 remainder; }; struct RangeWithStep @@ -565,7 +575,8 @@ Pipe ReadFromSystemNumbersStep::makePipe() numbers_storage.offset + i * max_block_size, num_streams * max_block_size, numbers_storage.column_name, - numbers_storage.step); + numbers_storage.step, + numbers_storage.offset % numbers_storage.step); if (numbers_storage.limit && i == 0) { diff --git a/src/Storages/System/StorageSystemNumbers.h b/src/Storages/System/StorageSystemNumbers.h index 298721984b8..c698bae4393 100644 --- a/src/Storages/System/StorageSystemNumbers.h +++ b/src/Storages/System/StorageSystemNumbers.h @@ -10,6 +10,34 @@ namespace DB class Context; +/** Implements a table engine for the system table "numbers". + * The table contains the only column number UInt64. + * From this table, you can read all natural numbers, starting from 0 (to 2^64 - 1, and then again). + * + * You could also specify a limit (how many numbers to give). + * + * How to generate numbers? + * + * 1. First try a smart fashion: + * + * In this fashion we try to push filters and limit down to scanning. + * Firstly extract plain ranges(no overlapping and ordered) by filter expressions. + * + * For example: + * where (numbers > 1 and numbers < 3) or (numbers in (4, 6)) or (numbers > 7 and numbers < 9) + * + * We will get ranges + * (1, 3), [4, 4], [6, 6], (7, 9) + * + * Then split the ranges evenly to one or multi-streams. With this way we will get result without large scanning. + * + * 2. If fail to extract plain ranges, fall back to ordinary scanning. + * + * If multithreaded is specified, numbers will be generated in several streams + * (and result could be out of order). If both multithreaded and limit are specified, + * the table could give you not exactly 1..limit range, but some arbitrary 'limit' numbers. 
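 *
 * For example (figures taken from the tests added in this series),
 * generate_series(10, 20, 3) maps to this storage with offset = 10,
 * limit = 11 and step = 3, and produces 10, 13, 16, 19.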
+ */ + class StorageSystemNumbers final : public IStorage { public: From 696609e7d562d15cfc7a6ffa776785444a97c2e7 Mon Sep 17 00:00:00 2001 From: divanik Date: Sat, 10 Feb 2024 19:59:57 +0000 Subject: [PATCH 0018/1081] Kek --- .../QueryPlan/ReadFromSystemNumbersStep.cpp | 42 ++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index 2488fa37643..dc6aebc69c1 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -14,6 +14,7 @@ #include #include +#include "Core/Types.h" #include "base/types.h" namespace DB @@ -51,13 +52,16 @@ protected: { UInt64 curr = next; /// The local variable for some reason works faster (>20%) than member of class. UInt64 first_element = (curr / step) * step; - if (first_element > std::numeric_limits::max() - remainder) { + if (first_element > std::numeric_limits::max() - remainder) + { auto column = ColumnUInt64::create(0); return {Columns{std::move(column)}, 0}; } first_element += remainder; - if (first_element < curr) { - if (first_element > std::numeric_limits::max() - step) { + if (first_element < curr) + { + if (first_element > std::numeric_limits::max() - step) + { auto column = ColumnUInt64::create(0); return {Columns{std::move(column)}, 0}; } @@ -101,6 +105,8 @@ using RangesWithStep = std::vector; std::optional stepped_range_from_range(const Range & r, UInt64 step, UInt64 remainder) { + // LOG_DEBUG(&Poco::Logger::get("Stepped from range"), + // "stepped from range"); if ((r.right.get() == 0) && (!r.right_included)) return std::nullopt; UInt64 begin = (r.left.get() / step) * step; @@ -126,7 +132,11 @@ std::optional stepped_range_from_range(const Range & r, UInt64 st if (r.range.right.isPositiveInfinity()) return static_cast(std::numeric_limits::max() - r.range.left.get()) / r.step + r.range.left_included; - return static_cast(r.range.right.get() - r.range.left.get()) / r.step + 1; + UInt128 size = static_cast(r.range.right.get() - r.range.left.get()) / r.step; + if (r.range.right_included && (r.range.right.get() % r.step == 0)) { + ++size; + } + return size; }; [[maybe_unused]] auto sizeOfRanges(const RangesWithStep & rs) @@ -173,6 +183,17 @@ public: , base_block_size(base_block_size_) , step(step_) { + // for (const auto& range_with_step : ranges_) { + // // LOG_DEBUG(&Poco::Logger::get("Ranges With Step"), + // // "Ranges: {} {} {} {} {}", + // // range_with_step.range.left.get(), + // // range_with_step.range.right.get(), + // // range_with_step.range.left_included, + // // range_with_step.range.right_included, + // // range_with_step.step); + // // LOG_DEBUG(&Poco::Logger::get("Ranges With Step"), + // // "Step: {}", step); + // } } String getName() const override { return "NumbersRange"; } @@ -241,6 +262,8 @@ protected: RangesPos start, end; auto block_size = findRanges(start, end, base_block_size); + // LOG_DEBUG(&Poco::Logger::get("Found range"), "Evth: {} {} {} {} {} {}", start.offset_in_ranges, static_cast(start.offset_in_range), end.offset_in_ranges, static_cast(end.offset_in_range), base_block_size, block_size); + if (!block_size) return {}; @@ -256,6 +279,11 @@ protected: while (block_size - provided != 0) { UInt64 need = block_size - provided; + // LOG_DEBUG(&Poco::Logger::get("Indices:"), + // "Indices: {} {}, provided: {}", + // ranges.size(), + // cursor.offset_in_ranges, + // provided); auto & range = 
ranges[cursor.offset_in_ranges]; UInt128 can_provide = cursor.offset_in_ranges == end.offset_in_ranges @@ -445,13 +473,15 @@ Pipe ReadFromSystemNumbersStep::makePipe() Pipe pipe; Ranges ranges; - // LOG_DEBUG(&Poco::Logger::get("parameters"), "Parameters: {} {} {}", numbers_storage.step, numbers_storage.limit.value(), numbers_storage.offset); + + // LOG_DEBUG(&Poco::Logger::get("parameters"), "Parameters: {} {} {} {}", numbers_storage.step, numbers_storage.offset, numbers_storage.limit.has_value(), numbers_storage.limit.has_value() ? numbers_storage.limit.value() : UInt64{0}); if (numbers_storage.limit.has_value() && (numbers_storage.limit.value() == 0)) { pipe.addSource(std::make_shared(NumbersSource::createHeader(numbers_storage.column_name))); return pipe; } + chassert(numbers_storage.step != UInt64{0}); /// Build rpn of query filters KeyCondition condition(buildFilterDAG(), context, column_names, key_expression); @@ -575,7 +605,7 @@ Pipe ReadFromSystemNumbersStep::makePipe() numbers_storage.offset + i * max_block_size, num_streams * max_block_size, numbers_storage.column_name, - numbers_storage.step, + numbers_storage.step, numbers_storage.offset % numbers_storage.step); if (numbers_storage.limit && i == 0) From 3ec9f3c4c89dec2f1971979d7d3ae406c1ecd938 Mon Sep 17 00:00:00 2001 From: divanik Date: Sat, 10 Feb 2024 20:06:52 +0000 Subject: [PATCH 0019/1081] Check foormattign --- src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index dc6aebc69c1..1e6b539ee2e 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -52,15 +52,15 @@ protected: { UInt64 curr = next; /// The local variable for some reason works faster (>20%) than member of class. 
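/// Alignment sketch with assumed values step = 5, remainder = 2:
/// curr = 7 gives (7 / 5) * 5 + 2 = 7, already a member of the series;
/// curr = 8 gives 5 + 2 = 7 < 8, so the branch below advances it by step to 12.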
UInt64 first_element = (curr / step) * step; - if (first_element > std::numeric_limits::max() - remainder) + if (first_element > std::numeric_limits::max() - remainder) { auto column = ColumnUInt64::create(0); return {Columns{std::move(column)}, 0}; } first_element += remainder; - if (first_element < curr) + if (first_element < curr) { - if (first_element > std::numeric_limits::max() - step) + if (first_element > std::numeric_limits::max() - step) { auto column = ColumnUInt64::create(0); return {Columns{std::move(column)}, 0}; @@ -133,9 +133,8 @@ std::optional stepped_range_from_range(const Range & r, UInt64 st return static_cast(std::numeric_limits::max() - r.range.left.get()) / r.step + r.range.left_included; UInt128 size = static_cast(r.range.right.get() - r.range.left.get()) / r.step; - if (r.range.right_included && (r.range.right.get() % r.step == 0)) { + if (r.range.right_included && (r.range.right.get() % r.step == 0)) ++size; - } return size; }; From d0456980991c45935fd316ca7dc2bd61cf45e5b9 Mon Sep 17 00:00:00 2001 From: divanik Date: Sat, 10 Feb 2024 23:04:52 +0000 Subject: [PATCH 0020/1081] It seems to work --- .../QueryPlan/ReadFromSystemNumbersStep.cpp | 31 ++++++------------- ...ble_functions_must_be_documented.reference | 1 + 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index 1e6b539ee2e..2217e426b02 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -97,7 +97,8 @@ private: struct RangeWithStep { - Range range; + UInt64 left; + UInt64 right; UInt64 step; }; @@ -124,18 +125,12 @@ std::optional stepped_range_from_range(const Range & r, UInt64 st if ((begin >= r.right_included) && (begin - r.right_included >= r.right.get())) return std::nullopt; UInt64 right_edge_included = r.right.get() - (1 - r.right_included); - return std::optional{RangeWithStep{Range(begin, true, right_edge_included, true), step}}; + return std::optional{RangeWithStep{begin, right_edge_included, step}}; } [[maybe_unused]] UInt128 sizeOfRange(const RangeWithStep & r) { - if (r.range.right.isPositiveInfinity()) - return static_cast(std::numeric_limits::max() - r.range.left.get()) / r.step + r.range.left_included; - - UInt128 size = static_cast(r.range.right.get() - r.range.left.get()) / r.step; - if (r.range.right_included && (r.range.right.get() % r.step == 0)) - ++size; - return size; + return static_cast(r.right - r.left) / r.step + 1; }; [[maybe_unused]] auto sizeOfRanges(const RangesWithStep & rs) @@ -252,10 +247,6 @@ protected: if (ranges.empty()) return {}; - auto first_value = [](const RangeWithStep & r) { return r.range.left.get() + (r.range.left_included ? 0 : 1); }; - - auto last_value = [](const RangeWithStep & r) { return r.range.right.get() - (r.range.right_included ? 0 : 1); }; - /// Find the data range. /// If data left is small, shrink block size. RangesPos start, end; @@ -287,7 +278,7 @@ protected: UInt128 can_provide = cursor.offset_in_ranges == end.offset_in_ranges ? 
end.offset_in_range - cursor.offset_in_range - : static_cast(last_value(range) - first_value(range)) / range.step + 1 - cursor.offset_in_range; + : static_cast(range.right - range.left) / range.step + 1 - cursor.offset_in_range; /// set value to block auto set_value = [&pos, this](UInt128 & start_value, UInt128 & end_value) @@ -312,7 +303,7 @@ protected: if (can_provide > need) { - UInt64 start_value = first_value(range) + cursor.offset_in_range * step; + UInt64 start_value = range.left + cursor.offset_in_range * step; /// end_value will never overflow iota_with_step(pos, static_cast(need), start_value, step); pos += need; @@ -323,7 +314,7 @@ protected: else if (can_provide == need) { /// to avoid UInt64 overflow - UInt128 start_value = static_cast(first_value(range)) + cursor.offset_in_range * step; + UInt128 start_value = static_cast(range.left) + cursor.offset_in_range * step; UInt128 end_value = start_value + need * step; set_value(start_value, end_value); @@ -334,7 +325,7 @@ protected: else { /// to avoid UInt64 overflow - UInt128 start_value = static_cast(first_value(range)) + cursor.offset_in_range * step; + UInt128 start_value = static_cast(range.left) + cursor.offset_in_range * step; UInt128 end_value = start_value + can_provide * step; set_value(start_value, end_value); @@ -400,9 +391,7 @@ namespace else { auto & range = ranges[i]; - UInt64 right = range.range.left.get() + static_cast(size); - range.range.right = Field(right); - range.range.right_included = !range.range.left_included; + range.right = range.left + static_cast(size) * range.step - 1; last_range_idx = i; break; } @@ -558,7 +547,7 @@ Pipe ReadFromSystemNumbersStep::makePipe() const auto & limit_offset = limit_length_and_offset.second; /// If intersected ranges is limited or we can pushdown limit. - if (!intersected_ranges.rbegin()->range.right.isPositiveInfinity() || should_pushdown_limit) + if (should_pushdown_limit) { UInt128 total_size = sizeOfRanges(intersected_ranges); UInt128 query_limit = limit_length + limit_offset; diff --git a/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference b/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference index e4040a2d371..1e4f21a6722 100644 --- a/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02414_all_new_table_functions_must_be_documented.reference @@ -4,6 +4,7 @@ dictionary executable file generateRandom +generate_series input jdbc merge From 789d3c699c77d7a39f42281d9dc0c61010471242 Mon Sep 17 00:00:00 2001 From: divanik Date: Sat, 10 Feb 2024 23:49:58 +0000 Subject: [PATCH 0021/1081] Remove bug for mt --- .../QueryPlan/ReadFromSystemNumbersStep.cpp | 65 +++++++++---------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index 2217e426b02..3656a6d31ee 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -547,42 +547,39 @@ Pipe ReadFromSystemNumbersStep::makePipe() const auto & limit_offset = limit_length_and_offset.second; /// If intersected ranges is limited or we can pushdown limit. 
- if (should_pushdown_limit) + UInt128 total_size = sizeOfRanges(intersected_ranges); + UInt128 query_limit = limit_length + limit_offset; + + /// limit total_size by query_limit + if (should_pushdown_limit && query_limit < total_size) { - UInt128 total_size = sizeOfRanges(intersected_ranges); - UInt128 query_limit = limit_length + limit_offset; - - /// limit total_size by query_limit - if (should_pushdown_limit && query_limit < total_size) - { - total_size = query_limit; - /// We should shrink intersected_ranges for case: - /// intersected_ranges: [1, 4], [7, 100]; query_limit: 2 - shrinkRanges(intersected_ranges, total_size); - } - - checkLimits(size_t(total_size)); - - if (total_size / max_block_size < num_streams) - num_streams = static_cast(total_size / max_block_size); - - if (num_streams == 0) - num_streams = 1; - - /// Ranges state, all streams will share the state. - auto ranges_state = std::make_shared(); - for (size_t i = 0; i < num_streams; ++i) - { - auto source = std::make_shared( - intersected_ranges, ranges_state, max_block_size, numbers_storage.step, numbers_storage.column_name); - - if (i == 0) - source->addTotalRowsApprox(total_size); - - pipe.addSource(std::move(source)); - } - return pipe; + total_size = query_limit; + /// We should shrink intersected_ranges for case: + /// intersected_ranges: [1, 4], [7, 100]; query_limit: 2 + shrinkRanges(intersected_ranges, total_size); } + + checkLimits(size_t(total_size)); + + if (total_size / max_block_size < num_streams) + num_streams = static_cast(total_size / max_block_size); + + if (num_streams == 0) + num_streams = 1; + + /// Ranges state, all streams will share the state. + auto ranges_state = std::make_shared(); + for (size_t i = 0; i < num_streams; ++i) + { + auto source = std::make_shared( + intersected_ranges, ranges_state, max_block_size, numbers_storage.step, numbers_storage.column_name); + + if (i == 0) + source->addTotalRowsApprox(total_size); + + pipe.addSource(std::move(source)); + } + return pipe; } /// Fall back to NumbersSource From 0f84f68da77663e2adcce800cceefff5ab019b58 Mon Sep 17 00:00:00 2001 From: divanik Date: Tue, 13 Feb 2024 09:59:39 +0000 Subject: [PATCH 0022/1081] Simplified code --- src/Common/iota.cpp | 2 +- src/Common/iota.h | 2 +- .../QueryPlan/ReadFromSystemNumbersStep.cpp | 65 +++++-------------- .../02970_generate_series.reference | 1 - .../0_stateless/02970_generate_series.sql | 1 - 5 files changed, 20 insertions(+), 51 deletions(-) diff --git a/src/Common/iota.cpp b/src/Common/iota.cpp index 532c4bde76d..86c9e04bb06 100644 --- a/src/Common/iota.cpp +++ b/src/Common/iota.cpp @@ -60,6 +60,6 @@ template void iota_with_step(UInt8 * begin, size_t count, UInt8 first_value, UIn template void iota_with_step(UInt32 * begin, size_t count, UInt32 first_value, UInt32 step); template void iota_with_step(UInt64 * begin, size_t count, UInt64 first_value, UInt64 step); #if defined(OS_DARWIN) -extern template void iota_with_step(size_t * begin, size_t count, size_t first_value, size_t step); +template void iota_with_step(size_t * begin, size_t count, size_t first_value, size_t step); #endif } diff --git a/src/Common/iota.h b/src/Common/iota.h index f40cde9d5db..8fa18be9769 100644 --- a/src/Common/iota.h +++ b/src/Common/iota.h @@ -38,6 +38,6 @@ extern template void iota_with_step(UInt8 * begin, size_t count, UInt8 first_val extern template void iota_with_step(UInt32 * begin, size_t count, UInt32 first_value, UInt32 step); extern template void iota_with_step(UInt64 * begin, size_t count, UInt64 first_value, 
UInt64 step); #if defined(OS_DARWIN) -extern template void iota(size_t * begin, size_t count, size_t first_value, size_t step); +extern template void iota_with_step(size_t * begin, size_t count, size_t first_value, size_t step); #endif } diff --git a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp index 3656a6d31ee..d69e2b6ca5a 100644 --- a/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp +++ b/src/Processors/QueryPlan/ReadFromSystemNumbersStep.cpp @@ -15,6 +15,7 @@ #include #include "Core/Types.h" +#include "base/Decimal_fwd.h" #include "base/types.h" namespace DB @@ -31,13 +32,12 @@ namespace class NumbersSource : public ISource { public: - NumbersSource(UInt64 block_size_, UInt64 offset_, UInt64 chunk_step_, const std::string & column_name, UInt64 step_, UInt64 remainder_) + NumbersSource(UInt64 block_size_, UInt64 offset_, UInt64 chunk_step_, const std::string & column_name, UInt64 step_) : ISource(createHeader(column_name)) , block_size(block_size_) , next(offset_) , chunk_step(chunk_step_) , step(step_) - , remainder(remainder_) { } String getName() const override { return "Numbers"; } @@ -50,41 +50,19 @@ public: protected: Chunk generate() override { - UInt64 curr = next; /// The local variable for some reason works faster (>20%) than member of class. - UInt64 first_element = (curr / step) * step; - if (first_element > std::numeric_limits::max() - remainder) - { - auto column = ColumnUInt64::create(0); - return {Columns{std::move(column)}, 0}; - } - first_element += remainder; - if (first_element < curr) - { - if (first_element > std::numeric_limits::max() - step) - { - auto column = ColumnUInt64::create(0); - return {Columns{std::move(column)}, 0}; - } - first_element += step; - } - if (first_element - curr >= block_size) - { - auto column = ColumnUInt64::create(0); - return {Columns{std::move(column)}, 0}; - } - UInt64 filtered_block_size = (block_size - (first_element - curr) - 1) / step + 1; - - auto column = ColumnUInt64::create(filtered_block_size); + auto column = ColumnUInt64::create(block_size); ColumnUInt64::Container & vec = column->getData(); + + UInt64 curr = next; /// The local variable for some reason works faster (>20%) than member of class. UInt64 * pos = vec.data(); /// This also accelerates the code. 
- UInt64 * end = &vec[filtered_block_size]; - iota_with_step(pos, static_cast(end - pos), first_element, step); + UInt64 * end = &vec[block_size]; + iota_with_step(pos, static_cast(end - pos), curr, step); next += chunk_step; progress(column->size(), column->byteSize()); - return {Columns{std::move(column)}, filtered_block_size}; + return {Columns{std::move(column)}, block_size}; } private: @@ -92,14 +70,13 @@ private: UInt64 next; UInt64 chunk_step; UInt64 step; - UInt64 remainder; }; struct RangeWithStep { UInt64 left; - UInt64 right; UInt64 step; + UInt128 size; }; using RangesWithStep = std::vector; @@ -125,21 +102,16 @@ std::optional stepped_range_from_range(const Range & r, UInt64 st if ((begin >= r.right_included) && (begin - r.right_included >= r.right.get())) return std::nullopt; UInt64 right_edge_included = r.right.get() - (1 - r.right_included); - return std::optional{RangeWithStep{begin, right_edge_included, step}}; + return std::optional{RangeWithStep{begin, step, static_cast(right_edge_included - begin) / step + 1}}; } -[[maybe_unused]] UInt128 sizeOfRange(const RangeWithStep & r) -{ - return static_cast(r.right - r.left) / r.step + 1; -}; - [[maybe_unused]] auto sizeOfRanges(const RangesWithStep & rs) { UInt128 total_size{}; for (const RangeWithStep & r : rs) { /// total_size will never overflow - total_size += sizeOfRange(r); + total_size += r.size; } return total_size; }; @@ -211,7 +183,7 @@ protected: while (need != 0) { UInt128 can_provide = end.offset_in_ranges == ranges.size() ? static_cast(0) - : sizeOfRange(ranges[end.offset_in_ranges]) - end.offset_in_range; + : ranges[end.offset_in_ranges].size - end.offset_in_range; if (can_provide == 0) break; @@ -278,7 +250,7 @@ protected: UInt128 can_provide = cursor.offset_in_ranges == end.offset_in_ranges ? 
end.offset_in_range - cursor.offset_in_range - : static_cast(range.right - range.left) / range.step + 1 - cursor.offset_in_range; + : range.size - cursor.offset_in_range; /// set value to block auto set_value = [&pos, this](UInt128 & start_value, UInt128 & end_value) @@ -377,7 +349,7 @@ namespace size_t last_range_idx = 0; for (size_t i = 0; i < ranges.size(); i++) { - auto range_size = sizeOfRange(ranges[i]); + auto range_size = ranges[i].size; if (range_size < size) { size -= static_cast(range_size); @@ -391,7 +363,7 @@ namespace else { auto & range = ranges[i]; - range.right = range.left + static_cast(size) * range.step - 1; + range.size = static_cast(size); last_range_idx = i; break; } @@ -587,11 +559,10 @@ Pipe ReadFromSystemNumbersStep::makePipe() { auto source = std::make_shared( max_block_size, - numbers_storage.offset + i * max_block_size, - num_streams * max_block_size, + numbers_storage.offset + i * max_block_size * numbers_storage.step, + num_streams * max_block_size * numbers_storage.step, numbers_storage.column_name, - numbers_storage.step, - numbers_storage.offset % numbers_storage.step); + numbers_storage.step); if (numbers_storage.limit && i == 0) { diff --git a/tests/queries/0_stateless/02970_generate_series.reference b/tests/queries/0_stateless/02970_generate_series.reference index 9e6f1db911e..4e4f556a39b 100644 --- a/tests/queries/0_stateless/02970_generate_series.reference +++ b/tests/queries/0_stateless/02970_generate_series.reference @@ -4,7 +4,6 @@ 8 500000001 50000000 -100000001 0 10 13 diff --git a/tests/queries/0_stateless/02970_generate_series.sql b/tests/queries/0_stateless/02970_generate_series.sql index 045f584a622..a7f89e1bd3f 100644 --- a/tests/queries/0_stateless/02970_generate_series.sql +++ b/tests/queries/0_stateless/02970_generate_series.sql @@ -4,7 +4,6 @@ SELECT count() FROM generate_series(10, 20, 3); SELECT count() FROM generate_series(7, 77, 10); SELECT count() FROM generate_series(0, 1000000000, 2); SELECT count() FROM generate_series(0, 999999999, 20); -SELECT count() FROM generate_series(0, 1000000000, 2) WHERE generate_series % 5 == 0; SELECT * FROM generate_series(5, 4); SELECT * FROM generate_series(0, 0); From d12ecdc5f06689d6259e2ef082a916f8b2f1836f Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 13 Feb 2024 12:35:17 +0100 Subject: [PATCH 0023/1081] Asynchronous WriteBuffer for AzureBlobStorage --- src/Backups/BackupIO_AzureBlobStorage.cpp | 3 +- src/Core/Settings.h | 3 +- .../IO/WriteBufferFromAzureBlobStorage.cpp | 116 +++++++++++------- .../IO/WriteBufferFromAzureBlobStorage.h | 22 +++- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 3 +- .../AzureBlobStorage/AzureObjectStorage.cpp | 3 +- .../AzureBlobStorage/AzureObjectStorage.h | 5 +- src/IO/WriteBufferFromS3.cpp | 2 +- src/IO/WriteBufferFromS3.h | 5 +- src/IO/WriteBufferFromS3TaskTracker.cpp | 21 ++-- src/IO/WriteBufferFromS3TaskTracker.h | 8 +- 11 files changed, 124 insertions(+), 67 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 52ce20d5108..44a72f80456 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -278,7 +278,8 @@ std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const Strin settings->max_single_part_upload_size, settings->max_unexpected_write_error_retries, DBMS_DEFAULT_BUFFER_SIZE, - write_settings); + write_settings, + settings->max_inflight_parts_for_one_file); } void BackupWriterAzureBlobStorage::removeFile(const String & 
file_name) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 44badfefabb..53de245bdfc 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -80,7 +80,8 @@ class IColumn; M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \ M(UInt64, s3_upload_part_size_multiply_factor, 2, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \ M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor.", 0) \ - M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited. You ", 0) \ + M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited.", 0) \ + M(UInt64, azure_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited.", 0) \ M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ M(UInt64, azure_max_single_part_copy_size, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage.", 0) \ diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp index 905114f50e9..cbe2367823d 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp @@ -18,13 +18,21 @@ namespace ProfileEvents namespace DB { +struct WriteBufferFromAzureBlobStorage::PartData +{ + Memory<> memory; + size_t data_size = 0; +}; + WriteBufferFromAzureBlobStorage::WriteBufferFromAzureBlobStorage( std::shared_ptr blob_container_client_, const String & blob_path_, size_t max_single_part_upload_size_, size_t max_unexpected_write_error_retries_, size_t buf_size_, - const WriteSettings & write_settings_) + const WriteSettings & write_settings_, + size_t max_inflight_parts_for_one_file_, + ThreadPoolCallbackRunner schedule_) : WriteBufferFromFileBase(buf_size_, nullptr, 0) , log(getLogger("WriteBufferFromAzureBlobStorage")) , max_single_part_upload_size(max_single_part_upload_size_) @@ -32,7 +40,13 @@ WriteBufferFromAzureBlobStorage::WriteBufferFromAzureBlobStorage( , blob_path(blob_path_) , write_settings(write_settings_) , blob_container_client(blob_container_client_) + , task_tracker( + std::make_unique( + std::move(schedule_), + max_inflight_parts_for_one_file_, + limitedLog)) { + allocateBuffer(); } @@ -79,60 +93,80 @@ void WriteBufferFromAzureBlobStorage::finalizeImpl() { execWithRetry([this](){ next(); }, max_unexpected_write_error_retries); - if (tmp_buffer_write_offset > 0) - uploadBlock(tmp_buffer->data(), tmp_buffer_write_offset); + task_tracker->waitAll(); auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); execWithRetry([&](){ block_blob_client.CommitBlockList(block_ids); }, max_unexpected_write_error_retries); - LOG_TRACE(log, "Committed {} blocks for blob `{}`", block_ids.size(), blob_path); -} - -void WriteBufferFromAzureBlobStorage::uploadBlock(const char * data, size_t 
size) -{ - auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); - const std::string & block_id = block_ids.emplace_back(getRandomASCIIString(64)); - - Azure::Core::IO::MemoryBodyStream memory_stream(reinterpret_cast(data), size); - execWithRetry([&](){ block_blob_client.StageBlock(block_id, memory_stream); }, max_unexpected_write_error_retries, size); - tmp_buffer_write_offset = 0; - - LOG_TRACE(log, "Staged block (id: {}) of size {} (blob path: {}).", block_id, size, blob_path); -} - -WriteBufferFromAzureBlobStorage::MemoryBufferPtr WriteBufferFromAzureBlobStorage::allocateBuffer() const -{ - return std::make_unique>(max_single_part_upload_size); + LOG_DEBUG(log, "Committed {} blocks for blob `{}`", block_ids.size(), blob_path); } void WriteBufferFromAzureBlobStorage::nextImpl() { - size_t size_to_upload = offset(); + task_tracker->waitIfAny(); - if (size_to_upload == 0) - return; + reallocateBuffer(); + detachBuffer(); - if (!tmp_buffer) - tmp_buffer = allocateBuffer(); - - size_t uploaded_size = 0; - while (uploaded_size != size_to_upload) + while (!detached_part_data.empty()) { - size_t memory_buffer_remaining_size = max_single_part_upload_size - tmp_buffer_write_offset; - if (memory_buffer_remaining_size == 0) - uploadBlock(tmp_buffer->data(), tmp_buffer->size()); - - size_t size = std::min(memory_buffer_remaining_size, size_to_upload - uploaded_size); - memcpy(tmp_buffer->data() + tmp_buffer_write_offset, working_buffer.begin() + uploaded_size, size); - uploaded_size += size; - tmp_buffer_write_offset += size; + writePart(std::move(detached_part_data.front())); + detached_part_data.pop_front(); } - if (tmp_buffer_write_offset == max_single_part_upload_size) - uploadBlock(tmp_buffer->data(), tmp_buffer->size()); + allocateBuffer(); +} - if (write_settings.remote_throttler) - write_settings.remote_throttler->add(size_to_upload, ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); +void WriteBufferFromAzureBlobStorage::allocateBuffer() +{ + memory = Memory(max_single_part_upload_size); + WriteBuffer::set(memory.data(), memory.size()); +} + + +void WriteBufferFromAzureBlobStorage::reallocateBuffer() +{ + chassert(offset() == 0); + + if (available() > 0) + return; + + if (memory.size() == max_single_part_upload_size) + return; + + memory.resize(max_single_part_upload_size); + + WriteBuffer::set(memory.data(), memory.size()); + + chassert(offset() == 0); +} + +void WriteBufferFromAzureBlobStorage::detachBuffer() +{ + size_t data_size = size_t(position() - memory.data()); + auto buf = std::move(memory); + WriteBuffer::set(nullptr, 0); + detached_part_data.push_back({std::move(buf), data_size}); +} + +void WriteBufferFromAzureBlobStorage::writePart(WriteBufferFromAzureBlobStorage::PartData && data) +{ + if (data.data_size == 0) + return; + + auto upload_worker = [&] () + { + auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); + const std::string & block_id = block_ids.emplace_back(getRandomASCIIString(64)); + + Azure::Core::IO::MemoryBodyStream memory_stream(reinterpret_cast(data.memory.data()), data.data_size); + execWithRetry([&](){ block_blob_client.StageBlock(block_id, memory_stream); }, max_unexpected_write_error_retries, data.data_size); + + if (write_settings.remote_throttler) + write_settings.remote_throttler->add(data.data_size, ProfileEvents::RemoteWriteThrottlerBytes, ProfileEvents::RemoteWriteThrottlerSleepMicroseconds); + }; + + task_tracker->add(std::move(upload_worker)); } } diff 
--git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h index f105b35c121..2d11014fa2a 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h @@ -11,6 +11,7 @@ #include #include #include +#include namespace Poco @@ -21,6 +22,8 @@ class Logger; namespace DB { +class TaskTracker; + class WriteBufferFromAzureBlobStorage : public WriteBufferFromFileBase { public: @@ -32,7 +35,9 @@ public: size_t max_single_part_upload_size_, size_t max_unexpected_write_error_retries_, size_t buf_size_, - const WriteSettings & write_settings_); + const WriteSettings & write_settings_, + size_t max_inflight_parts_for_one_file_, + ThreadPoolCallbackRunner schedule_ = {}); ~WriteBufferFromAzureBlobStorage() override; @@ -42,11 +47,21 @@ public: void sync() override { next(); } private: + struct PartData; + + void writePart(WriteBufferFromAzureBlobStorage::PartData && data); + void detachBuffer(); + void allocateBuffer(); + void allocateFirstBuffer(); + void reallocateFirstBuffer(); + void reallocateBuffer(); + void finalizeImpl() override; void execWithRetry(std::function func, size_t num_tries, size_t cost = 0); void uploadBlock(const char * data, size_t size); LoggerPtr log; + LogSeriesLimiterPtr limitedLog = std::make_shared(log, 1, 5); const size_t max_single_part_upload_size; const size_t max_unexpected_write_error_retries; @@ -61,6 +76,11 @@ private: size_t tmp_buffer_write_offset = 0; MemoryBufferPtr allocateBuffer() const; + + bool first_buffer=true; + + std::unique_ptr task_tracker; + std::deque detached_part_data; }; } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 72c4abee5c9..f99586b2d1a 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -169,7 +169,8 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getUInt64(config_prefix + ".max_upload_part_size", 5ULL * 1024 * 1024 * 1024), config.getUInt64(config_prefix + ".max_single_part_copy_size", context->getSettings().azure_max_single_part_copy_size), config.getBool(config_prefix + ".use_native_copy", false), - config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", context->getSettings().azure_max_unexpected_write_error_retries) + config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", context->getSettings().azure_max_unexpected_write_error_retries), + config.getUInt64(config_prefix + ".max_inflight_parts_for_one_file", context->getSettings().azure_max_inflight_parts_for_one_file) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 74389aedb64..844789ea5b5 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -268,7 +268,8 @@ std::unique_ptr AzureObjectStorage::writeObject( /// NO settings.get()->max_single_part_upload_size, settings.get()->max_unexpected_write_error_retries, buf_size, - patchSettings(write_settings)); + patchSettings(write_settings), + settings.get()->max_inflight_parts_for_one_file); } /// Remove file. Throws exception if file doesn't exists or it's a directory. 
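The plumbing above exposes the new Azure upload concurrency limit both through the disk configuration key `max_inflight_parts_for_one_file` and through the server setting `azure_max_inflight_parts_for_one_file` added in `Settings.h` earlier in this patch. A minimal illustrative sketch of tuning it per session (the value `4` is an arbitrary example, not taken from the patch):

```sql
-- Cap concurrent part uploads per file for Azure writes in this session;
-- per the setting's description above, 0 means unlimited.
SET azure_max_inflight_parts_for_one_file = 4;
```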
diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index f16c35fb52c..1b473a01304 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -27,7 +27,8 @@ struct AzureObjectStorageSettings size_t max_upload_part_size_, size_t max_single_part_copy_size_, bool use_native_copy_, - size_t max_unexpected_write_error_retries_) + size_t max_unexpected_write_error_retries_, + size_t max_inflight_parts_for_one_file_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) @@ -37,6 +38,7 @@ struct AzureObjectStorageSettings , max_single_part_copy_size(max_single_part_copy_size_) , use_native_copy(use_native_copy_) , max_unexpected_write_error_retries (max_unexpected_write_error_retries_) + , max_inflight_parts_for_one_file (max_inflight_parts_for_one_file_) { } @@ -52,6 +54,7 @@ struct AzureObjectStorageSettings size_t max_single_part_copy_size = 256 * 1024 * 1024; bool use_native_copy = false; size_t max_unexpected_write_error_retries = 4; + size_t max_inflight_parts_for_one_file = 20; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 5bb01050591..6fc0a35672f 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -95,7 +95,7 @@ WriteBufferFromS3::WriteBufferFromS3( , object_metadata(std::move(object_metadata_)) , buffer_allocation_policy(ChooseBufferPolicy(upload_settings)) , task_tracker( - std::make_unique( + std::make_unique( std::move(schedule_), upload_settings.max_inflight_parts_for_one_file, limitedLog)) diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 230f39b074e..f3637122ee4 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -26,6 +27,8 @@ namespace DB * Data is divided on chunks with size greater than 'minimum_upload_part_size'. Last chunk can be less than this threshold. * Each chunk is written as a part to S3. */ +class TaskTracker; + class WriteBufferFromS3 final : public WriteBufferFromFileBase { public: @@ -118,7 +121,7 @@ private: size_t total_size = 0; size_t hidden_size = 0; - class TaskTracker; +// class TaskTracker; std::unique_ptr task_tracker; BlobStorageLogWriterPtr blob_log; diff --git a/src/IO/WriteBufferFromS3TaskTracker.cpp b/src/IO/WriteBufferFromS3TaskTracker.cpp index bce122dd6c8..e62de261fc2 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.cpp +++ b/src/IO/WriteBufferFromS3TaskTracker.cpp @@ -1,7 +1,5 @@ #include "config.h" -#if USE_AWS_S3 - #include namespace ProfileEvents @@ -12,19 +10,19 @@ namespace ProfileEvents namespace DB { -WriteBufferFromS3::TaskTracker::TaskTracker(ThreadPoolCallbackRunner scheduler_, size_t max_tasks_inflight_, LogSeriesLimiterPtr limitedLog_) +TaskTracker::TaskTracker(ThreadPoolCallbackRunner scheduler_, size_t max_tasks_inflight_, LogSeriesLimiterPtr limitedLog_) : is_async(bool(scheduler_)) , scheduler(scheduler_ ? 
std::move(scheduler_) : syncRunner()) , max_tasks_inflight(max_tasks_inflight_) , limitedLog(limitedLog_) {} -WriteBufferFromS3::TaskTracker::~TaskTracker() +TaskTracker::~TaskTracker() { safeWaitAll(); } -ThreadPoolCallbackRunner WriteBufferFromS3::TaskTracker::syncRunner() +ThreadPoolCallbackRunner TaskTracker::syncRunner() { return [](Callback && callback, int64_t) mutable -> std::future { @@ -35,7 +33,7 @@ ThreadPoolCallbackRunner WriteBufferFromS3::TaskTracker::syncRunner() }; } -void WriteBufferFromS3::TaskTracker::waitAll() +void TaskTracker::waitAll() { /// Exceptions are propagated for (auto & future : futures) @@ -48,7 +46,7 @@ void WriteBufferFromS3::TaskTracker::waitAll() finished_futures.clear(); } -void WriteBufferFromS3::TaskTracker::safeWaitAll() +void TaskTracker::safeWaitAll() { for (auto & future : futures) { @@ -71,7 +69,7 @@ void WriteBufferFromS3::TaskTracker::safeWaitAll() finished_futures.clear(); } -void WriteBufferFromS3::TaskTracker::waitIfAny() +void TaskTracker::waitIfAny() { if (futures.empty()) return; @@ -99,7 +97,7 @@ void WriteBufferFromS3::TaskTracker::waitIfAny() ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds()); } -void WriteBufferFromS3::TaskTracker::add(Callback && func) +void TaskTracker::add(Callback && func) { /// All this fuzz is about 2 things. This is the most critical place of TaskTracker. /// The first is not to fail insertion in the list `futures`. @@ -134,7 +132,7 @@ void WriteBufferFromS3::TaskTracker::add(Callback && func) waitTilInflightShrink(); } -void WriteBufferFromS3::TaskTracker::waitTilInflightShrink() +void TaskTracker::waitTilInflightShrink() { if (!max_tasks_inflight) return; @@ -166,11 +164,10 @@ void WriteBufferFromS3::TaskTracker::waitTilInflightShrink() ProfileEvents::increment(ProfileEvents::WriteBufferFromS3WaitInflightLimitMicroseconds, watch.elapsedMicroseconds()); } -bool WriteBufferFromS3::TaskTracker::isAsync() const +bool TaskTracker::isAsync() const { return is_async; } } -#endif diff --git a/src/IO/WriteBufferFromS3TaskTracker.h b/src/IO/WriteBufferFromS3TaskTracker.h index 815e041ae52..134abbbc4c1 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.h +++ b/src/IO/WriteBufferFromS3TaskTracker.h @@ -1,9 +1,7 @@ #pragma once #include "config.h" - -#if USE_AWS_S3 - +#include #include "WriteBufferFromS3.h" #include @@ -22,7 +20,7 @@ namespace DB /// Basic exception safety is provided. If exception occurred the object has to be destroyed. /// No thread safety is provided. Use this object with no concurrency. 
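/// add() schedules the given callback and then calls waitTilInflightShrink(),
/// which blocks until the number of in-flight tasks drops below
/// max_tasks_inflight (a zero limit disables this backpressure).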
-class WriteBufferFromS3::TaskTracker
+class TaskTracker
 {
 public:
     using Callback = std::function<void()>;
@@ -68,5 +66,3 @@ private:
 };

 }
-
-#endif

From 26fd3d0d852986b6bbaf595087cb0d06bdff9f93 Mon Sep 17 00:00:00 2001
From: Smita Kulkarni
Date: Wed, 14 Feb 2024 16:13:53 +0100
Subject: [PATCH 0024/1081] Removed offset check

---
 src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp
index cbe2367823d..d700090303a 100644
--- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp
+++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp
@@ -126,8 +126,6 @@ void WriteBufferFromAzureBlobStorage::allocateBuffer()

 void WriteBufferFromAzureBlobStorage::reallocateBuffer()
 {
-    chassert(offset() == 0);
-
     if (available() > 0)
         return;

From 750a82a4ff615190a2793c0cfae9f4c1f5c75433 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Thu, 15 Feb 2024 13:23:33 +0100
Subject: [PATCH 0025/1081] Update doc

---
 .../mergetree-family/mergetree.md  |   2 +
 docs/en/operations/storing-data.md | 146 ++++++++++++++++--
 2 files changed, 134 insertions(+), 14 deletions(-)

diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md
index f185c11bab3..e1eef8db9ab 100644
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@@ -1106,6 +1106,8 @@ Configuration markup:
 ```

+Also see [configuring external storage options](/docs/en/operations/storing-data.md/#configuring-external-storage).
+
 :::note cache configuration
 ClickHouse versions 22.3 through 22.7 use a different cache configuration, see [using local cache](/docs/en/operations/storing-data.md/#using-local-cache) if you are using one of those versions.
 :::

diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md
index 003277c8d4f..7a7edfb1a90 100644
--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@@ -11,45 +11,163 @@ To work with data stored on `Amazon S3` disks use [S3](/docs/en/engines/table-en

 To load data from a web server with static files use a disk with type [web](#storing-data-on-webserver).

-## Configuring HDFS {#configuring-hdfs}
+## Configuring external storage {#configuring-external-storage}

-[MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) family table engines can store data to HDFS using a disk with type `HDFS`.
+[MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) family table engines can store data to `S3`, `AzureBlobStorage`, `HDFS` using a disk with types `s3`, `azure_blob_storage`, `hdfs` respectively.

 Configuration markup:

+Let's take a look at the different storage configuration options, using `S3` storage as an example.
+First, define the configuration in the server configuration file. To configure `S3` storage, the following configuration can be used:
+
 ``` xml
-
-        hdfs
-        hdfs://hdfs1:9000/clickhouse/
-
+
+            s3
+            https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+            1
+
-
+
- hdfs + s3
-
+
+
+```
+
+Starting with ClickHouse version 24.1, a different type of configuration is supported in addition to the older one:
+
+``` xml
+
+
+
+
+            object_storage
+            s3
+            local
+            https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+            1
+
+
+
+
+
+
+ s3 +
+
+
+
+
+
+```
+
+In order to make a specific kind of storage the default option for all `MergeTree` tables, add the following section to the configuration file:
+
+``` xml
+
- 0
+ s3
+
+```
+
+If you want to configure a specific storage policy only for a specific table, you can define it in settings while creating the table:
+
+``` sql
+CREATE TABLE test (a Int32, b String)
+ENGINE = MergeTree() ORDER BY a
+SETTINGS storage_policy = 's3';
+```
+
+You can also use `disk` instead of `storage_policy`. In this case it is not required to have a `storage_policy` section in the configuration file; a `disk` section is enough.
+
+``` sql
+CREATE TABLE test (a Int32, b String)
+ENGINE = MergeTree() ORDER BY a
+SETTINGS disk = 's3';
+```
+
+There is also a possibility to specify a storage configuration without a preconfigured disk in the configuration file:
+
+``` sql
+CREATE TABLE test (a Int32, b String)
+ENGINE = MergeTree() ORDER BY a
+SETTINGS disk = disk(name = 's3_disk', type = 's3', endpoint = 'https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/', use_environment_credentials = 1);
+```
+
+Adding a cache is also possible:
+
+``` sql
+CREATE TABLE test (a Int32, b String)
+ENGINE = MergeTree() ORDER BY a
+SETTINGS disk = disk(name = 'cached_s3_disk', type = 'cache', max_size = '10Gi', path = '/s3_cache', disk = disk(name = 's3_disk', type = 's3', endpoint = 'https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/', use_environment_credentials = 1));
+```
+
+A combination of config-file disk configuration and SQL-defined configuration is also possible:
+
+``` sql
+CREATE TABLE test (a Int32, b String)
+ENGINE = MergeTree() ORDER BY a
+SETTINGS disk = disk(name = 'cached_s3_disk', type = 'cache', max_size = '10Gi', path = '/s3_cache', disk = 's3');
+```
+
+Here `s3` is a disk name from the server configuration file, while the `cache` disk is defined via SQL.
+
+Let's take a closer look at the configuration parameters.
+
+All disk configurations require a `type` section, equal to one of `s3`, `azure_blob_storage`, `hdfs`, `local`, `cache`, `web`. It is followed by the configuration of the specific storage type.
+Starting from ClickHouse version 24.1, you can use a new configuration option. It requires specifying `type` as `object_storage`, `object_storage_type` as one of `s3`, `azure_blob_storage`, `hdfs`, `local`, `cache`, `web`, and optionally `metadata_type`, which is `local` by default, but can also be set to `plain`, `web`.
+
+E.g. first configuration option:
+``` xml
+
+    s3
+    https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+    1
+
+```
+
+and second (from `24.1`):
+``` xml
+
+    object_storage
+    s3
+    local
+    https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+    1
+
+```
+
+Configuration like
+``` xml
+
+    s3_plain
+    https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+    1
+
+```
+
+is equal to
+``` xml
+
+    object_storage
+    s3
+    plain
+    https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+    1
+
+```
+
+For detailed configuration options of each storage type, see [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md).
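A quick way to check the result of any of the configurations above is to inspect the disks and policies the server has registered. This is an illustrative sketch, not part of the patch; it assumes the `s3` disk name used throughout this section:

```sql
-- List the disks the server has loaded.
SELECT name, path FROM system.disks;

-- Show which disks each storage policy maps to.
SELECT policy_name, volume_name, disks FROM system.storage_policies;
```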
## Using Virtual File System for Data Encryption {#encrypted-virtual-file-system} From 9bcd4daabe56e29132fc5098420afb4dcba9001d Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 15 Feb 2024 16:19:31 +0100 Subject: [PATCH 0026/1081] Better --- .../mergetree-family/mergetree.md | 294 +------------ docs/en/operations/storing-data.md | 411 +++++++++++++++--- 2 files changed, 346 insertions(+), 359 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index e1eef8db9ab..0fff13c906f 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -987,49 +987,6 @@ ORDER BY (postcode1, postcode2, addr1, addr2) # highlight-end ``` -### Nested Dynamic Storage - -This example query builds on the above dynamic disk configuration and shows how to -use a local disk to cache data from a table stored at a URL. Neither the cache disk -nor the web storage is configured in the ClickHouse configuration files; both are -configured in the CREATE/ATTACH query settings. - -In the settings highlighted below notice that the disk of `type=web` is nested within -the disk of `type=cache`. - -```sql -ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7' -( - price UInt32, - date Date, - postcode1 LowCardinality(String), - postcode2 LowCardinality(String), - type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4), - is_new UInt8, - duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2), - addr1 String, - addr2 String, - street LowCardinality(String), - locality LowCardinality(String), - town LowCardinality(String), - district LowCardinality(String), - county LowCardinality(String) -) -ENGINE = MergeTree -ORDER BY (postcode1, postcode2, addr1, addr2) - # highlight-start - SETTINGS disk = disk( - type=cache, - max_size='1Gi', - path='/var/lib/clickhouse/custom_disk_cache/', - disk=disk( - type=web, - endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/' - ) - ); - # highlight-end -``` - ### Details {#details} In the case of `MergeTree` tables, data is getting to disk in different ways: @@ -1058,19 +1015,17 @@ During this time, they are not moved to other volumes or disks. Therefore, until User can assign new big parts to different disks of a [JBOD](https://en.wikipedia.org/wiki/Non-RAID_drive_architectures) volume in a balanced way using the [min_bytes_to_rebalance_partition_over_jbod](/docs/en/operations/settings/merge-tree-settings.md/#min-bytes-to-rebalance-partition-over-jbod) setting. -## Using S3 for Data Storage {#table_engine-mergetree-s3} +## Using External Storage for Data Storage {#table_engine-mergetree-s3} -:::note -Google Cloud Storage (GCS) is also supported using the type `s3`. See [GCS backed MergeTree](/docs/en/integrations/gcs). -::: +[MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) family table engines can store data to `S3`, `AzureBlobStorage`, `HDFS` using a disk with types `s3`, `azure_blob_storage`, `hdfs` accordingly. See [configuring external storage options](/docs/en/operations/storing-data.md/#configuring-external-storage) for more details. -`MergeTree` family table engines can store data to [S3](https://aws.amazon.com/s3/) using a disk with type `s3`. +Example for [S3](https://aws.amazon.com/s3/) as external storage using a disk with type `s3`. Configuration markup: ``` xml ... 
- +e s3 true @@ -1112,247 +1067,6 @@ Also see [configuring external storage options](/docs/en/operations/storing-data ClickHouse versions 22.3 through 22.7 use a different cache configuration, see [using local cache](/docs/en/operations/storing-data.md/#using-local-cache) if you are using one of those versions. ::: -### Configuring the S3 disk - -Required parameters: - -- `endpoint` — S3 endpoint URL in `path` or `virtual hosted` [styles](https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html). Endpoint URL should contain a bucket and root path to store data. -- `access_key_id` — S3 access key id. -- `secret_access_key` — S3 secret access key. - -Optional parameters: - -- `region` — S3 region name. -- `support_batch_delete` — This controls the check to see if batch deletes are supported. Set this to `false` when using Google Cloud Storage (GCS) as GCS does not support batch deletes and preventing the checks will prevent error messages in the logs. -- `use_environment_credentials` — Reads AWS credentials from the Environment variables AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY and AWS_SESSION_TOKEN if they exist. Default value is `false`. -- `use_insecure_imds_request` — If set to `true`, S3 client will use insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Default value is `false`. -- `expiration_window_seconds` — Grace period for checking if expiration-based credentials have expired. Optional, default value is `120`. -- `proxy` — Proxy configuration for S3 endpoint. Each `uri` element inside `proxy` block should contain a proxy URL. -- `connect_timeout_ms` — Socket connect timeout in milliseconds. Default value is `10 seconds`. -- `request_timeout_ms` — Request timeout in milliseconds. Default value is `5 seconds`. -- `retry_attempts` — Number of retry attempts in case of failed request. Default value is `10`. -- `single_read_retries` — Number of retry attempts in case of connection drop during read. Default value is `4`. -- `min_bytes_for_seek` — Minimal number of bytes to use seek operation instead of sequential read. Default value is `1 Mb`. -- `metadata_path` — Path on local FS to store metadata files for S3. Default value is `/var/lib/clickhouse/disks//`. -- `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`. -- `header` — Adds specified HTTP header to a request to given endpoint. Optional, can be specified multiple times. -- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set. -- `server_side_encryption_kms_key_id` - If specified, required headers for accessing S3 objects with [SSE-KMS encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) will be set. If an empty string is specified, the AWS managed S3 key will be used. Optional. -- `server_side_encryption_kms_encryption_context` - If specified alongside `server_side_encryption_kms_key_id`, the given encryption context header for SSE-KMS will be set. Optional. -- `server_side_encryption_kms_bucket_key_enabled` - If specified alongside `server_side_encryption_kms_key_id`, the header to enable S3 bucket keys for SSE-KMS will be set. Optional, can be `true` or `false`, defaults to nothing (matches the bucket-level setting). -- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited). 
-- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_put_rps`. -- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited). -- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting request per second limit. By default (`0` value) equals to `s3_max_get_rps`. -- `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). -- `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). -- `key_template` — Define the format with which the object keys are generated. By default, Clickhouse takes `root path` from `endpoint` option and adds random generated suffix. That suffix is a dir with 3 random symbols and a file name with 29 random symbols. With that option you have a full control how to the object keys are generated. Some usage scenarios require having random symbols in the prefix or in the middle of object key. For example: `[a-z]{3}-prefix-random/constant-part/random-middle-[a-z]{3}/random-suffix-[a-z]{29}`. The value is parsed with [`re2`](https://github.com/google/re2/wiki/Syntax). Only some subset of the syntax is supported. Check if your preferred format is supported before using that option. Disk isn't initialized if clickhouse is unable to generate a key by the value of `key_template`. It requires enabled feature flag [storage_metadata_write_full_object_key](/docs/en/operations/settings/settings#storage_metadata_write_full_object_key). It forbids declaring the `root path` in `endpoint` option. It requires definition of the option `key_compatibility_prefix`. -- `key_compatibility_prefix` — That option is required when option `key_template` is in use. In order to be able to read the objects keys which were stored in the metadata files with the metadata version lower that `VERSION_FULL_OBJECT_KEY`, the previous `root path` from the `endpoint` option should be set here. - -### Configuring the cache - -This is the cache configuration from above: -```xml - - cache - s3 - /var/lib/clickhouse/disks/s3_cache/ - 10Gi - -``` - -These parameters define the cache layer: -- `type` — If a disk is of type `cache` it caches mark and index files in memory. -- `disk` — The name of the disk that will be cached. - -Cache parameters: -- `path` — The path where metadata for the cache is stored. -- `max_size` — The size (amount of disk space) that the cache can grow to. - -:::tip -There are several other cache parameters that you can use to tune your storage, see [using local cache](/docs/en/operations/storing-data.md/#using-local-cache) for the details. -::: - -S3 disk can be configured as `main` or `cold` storage: -``` xml - - ... - - - s3 - https://clickhouse-public-datasets.s3.amazonaws.com/my-bucket/root-path/ - your_access_key_id - your_secret_access_key - - - - - -
- s3 -
-
-
- - -
- default -
- - s3 - -
- 0.2 -
-
- ... -
-``` - -In case of `cold` option a data can be moved to S3 if local disk free size will be smaller than `move_factor * disk_size` or by TTL move rule. - -## Using Azure Blob Storage for Data Storage {#table_engine-mergetree-azure-blob-storage} - -`MergeTree` family table engines can store data to [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) using a disk with type `azure_blob_storage`. - -As of February 2022, this feature is still a fresh addition, so expect that some Azure Blob Storage functionalities might be unimplemented. - -Configuration markup: -``` xml - - ... - - - azure_blob_storage - http://account.blob.core.windows.net - container - account - pass123 - /var/lib/clickhouse/disks/blob_storage_disk/ - /var/lib/clickhouse/disks/blob_storage_disk/cache/ - false - - - ... - -``` - -Connection parameters: -* `storage_account_url` - **Required**, Azure Blob Storage account URL, like `http://account.blob.core.windows.net` or `http://azurite1:10000/devstoreaccount1`. -* `container_name` - Target container name, defaults to `default-container`. -* `container_already_exists` - If set to `false`, a new container `container_name` is created in the storage account, if set to `true`, disk connects to the container directly, and if left unset, disk connects to the account, checks if the container `container_name` exists, and creates it if it doesn't exist yet. - -Authentication parameters (the disk will try all available methods **and** Managed Identity Credential): -* `connection_string` - For authentication using a connection string. -* `account_name` and `account_key` - For authentication using Shared Key. - -Limit parameters (mainly for internal usage): -* `s3_max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage. -* `min_bytes_for_seek` - Limits the size of a seekable region. -* `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage. -* `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage. -* `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated. -* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object. - -Other parameters: -* `metadata_path` - Path on local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks//`. -* `skip_access_check` - If true, disk access checks will not be performed on disk start-up. Default value is `false`. -* `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). -* `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk). - -Examples of working configurations can be found in integration tests directory (see e.g. [test_merge_tree_azure_blob_storage](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml) or [test_azure_blob_storage_zero_copy_replication](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml)). 
- -:::note Zero-copy replication is not ready for production -Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use. -::: - -## HDFS storage {#hdfs-storage} - -In this sample configuration: -- the disk is of type `hdfs` -- the data is hosted at `hdfs://hdfs1:9000/clickhouse/` - -```xml - - - - - hdfs - hdfs://hdfs1:9000/clickhouse/ - true - - - local - / - - - - - -
- hdfs -
- - hdd - -
-
-
-
-
-``` - -## Web storage (read-only) {#web-storage} - -Web storage can be used for read-only purposes. An example use is for hosting sample -data, or for migrating data. - -:::tip -Storage can also be configured temporarily within a query, if a web dataset is not expected -to be used routinely, see [dynamic storage](#dynamic-storage) and skip editing the -configuration file. -::: - -In this sample configuration: -- the disk is of type `web` -- the data is hosted at `http://nginx:80/test1/` -- a cache on local storage is used - -```xml - - - - - web - http://nginx:80/test1/ - - - cache - web - cached_web_cache/ - 100000000 - - - - - -
- web -
-
-
- - -
- cached_web -
-
-
-
-
-
-```
-
 ## Virtual Columns {#virtual-columns}

 - `_part` — Name of a part.

diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md
index 7a7edfb1a90..baf4e1999a7 100644
--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@@ -5,21 +5,68 @@ sidebar_label: "External Disks for Storing Data"
 title: "External Disks for Storing Data"
 ---

-Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be expensive enough. To avoid that you can store the data remotely — on [Amazon S3](https://aws.amazon.com/s3/) disks or in the Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)).
+Data processed in ClickHouse is usually stored in the local file system, on the same machine as the ClickHouse server. That requires large-capacity disks, which can be expensive. To avoid that, you can store the data remotely. Various types of storage are supported:
+1. [Amazon S3](https://aws.amazon.com/s3/) object storage.
+2. The Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html))
+3. [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs).

-To work with data stored on `Amazon S3` disks use [S3](/docs/en/engines/table-engines/integrations/s3.md) table engine, and to work with data in the Hadoop Distributed File System — [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine.
-
-To load data from a web server with static files use a disk with type [web](#storing-data-on-webserver).
+Note: to work with data stored on `Amazon S3` disks use the [S3](/docs/en/engines/table-engines/integrations/s3.md) table engine, to work with data in the Hadoop Distributed File System use the [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine, and to work with data stored in Azure Blob Storage use the [AzureBlobStorage](/docs/en/engines/table-engines/integrations/AzureBlobStorage.md) table engine. They are different from the external storage described on this page, as they allow reading data stored in some general file format (like Parquet), while on this page we are describing the storage configuration for ClickHouse `MergeTree` family or `Log` family tables.

 ## Configuring external storage {#configuring-external-storage}

 [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) family table engines can store data to `S3`, `AzureBlobStorage`, `HDFS` using a disk with types `s3`, `azure_blob_storage`, `hdfs` respectively.

+Disk configuration requires:
+1. A `type` section, equal to one of `s3`, `azure_blob_storage`, `hdfs`, `local_blob_storage`, `web`.
+2. Configuration of the specific external storage type.
+
+Starting from ClickHouse version 24.1, it is possible to use a new configuration option.
+It requires specifying:
+1. `type` equal to `object_storage`
+2. `object_storage_type`, equal to one of `s3`, `azure_blob_storage`, `hdfs`, `local_blob_storage`, `web`.
+Optionally, `metadata_type` can be specified (it is equal to `local` by default), but it can also be set to `plain`, `web`.
+
+E.g. the configuration option
+``` xml
+
+    s3
+    https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+    1
+
+```
+
+is equal to the configuration (from `24.1`):
+``` xml
+
+    object_storage
+    s3
+    local
+    https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+    1
+
+```
+
+Configuration
+``` xml
+
+    s3_plain
+    https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+    1
+
+```
+
+is equal to
+``` xml
+
+    object_storage
+    s3
+    plain
+    https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+    1
+
+```
+
+An example of a full storage configuration looks like this:
+``` xml
+
+
+
+
+            s3
+            https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+            1
+
+
+
+
+
+                    s3
+
+
+
+
+
+```
+
+Starting with ClickHouse version 24.1, it can also look like this:
+``` xml
+
+
+
+
+            object_storage
+            s3
+            local
+            https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/
+            1
+
+
+
+
+
+                    s3
+
+
+
+
+
+```
+
 In order to make a specific kind of storage the default option for all `MergeTree` tables, add the following section to the configuration file:
 ``` xml
 
     s3
 
 ```
 
 If you want to configure a specific storage policy only for a specific table, you can define it in settings while creating the table:
 
 ``` sql
 CREATE TABLE test (a Int32, b String)
 ENGINE = MergeTree() ORDER BY a
 SETTINGS storage_policy = 's3';
 ```
 
 You can also use `disk` instead of `storage_policy`. In this case it is not required to have a `storage_policy` section in the configuration file; a `disk` section is enough.
 
 ``` sql
 CREATE TABLE test (a Int32, b String)
 ENGINE = MergeTree() ORDER BY a
 SETTINGS disk = 's3';
 ```
 
+## Dynamic Configuration {#dynamic-configuration}
+
+It is also possible to specify a storage configuration without a predefined disk in the configuration file: the disk can be configured in the CREATE/ATTACH query settings instead.
+
+The following example query builds on the dynamic disk configuration above and shows how to attach a table whose data is stored at a URL:
+
+```sql
+ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
+(
+    price UInt32,
+    date Date,
+    postcode1 LowCardinality(String),
+    postcode2 LowCardinality(String),
+    type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
+    is_new UInt8,
+    duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
+    addr1 String,
+    addr2 String,
+    street LowCardinality(String),
+    locality LowCardinality(String),
+    town LowCardinality(String),
+    district LowCardinality(String),
+    county LowCardinality(String)
+)
+ENGINE = MergeTree
+ORDER BY (postcode1, postcode2, addr1, addr2)
+ # highlight-start
+ SETTINGS disk = disk(
+    type=web,
+    endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
+ );
+ # highlight-end
+```
+
+The example below adds a cache to the external storage.
+
+```sql
+ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
+(
+    price UInt32,
+    date Date,
+    postcode1 LowCardinality(String),
+    postcode2 LowCardinality(String),
+    type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
+    is_new UInt8,
+    duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
+    addr1 String,
+    addr2 String,
+    street LowCardinality(String),
+    locality LowCardinality(String),
+    town LowCardinality(String),
+    district LowCardinality(String),
+    county LowCardinality(String)
+)
+ENGINE = MergeTree
+ORDER BY (postcode1, postcode2, addr1, addr2)
+ # highlight-start
+ SETTINGS disk = disk(
+    type=cache,
+    max_size='1Gi',
+    path='/var/lib/clickhouse/custom_disk_cache/',
+    disk=disk(
+        type=web,
+        endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
+    )
+ );
+ # highlight-end
+```
+
+In the settings highlighted above, notice that the disk of `type=web` is nested within the disk of `type=cache`.
+
+A combination of config-file-based and SQL-defined configuration is also possible:
+
+```sql
+ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
+(
+    price UInt32,
+    date Date,
+    postcode1 LowCardinality(String),
+    postcode2 LowCardinality(String),
+    type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
+    is_new UInt8,
+    duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
+    addr1 String,
+    addr2 String,
+    street LowCardinality(String),
+    locality LowCardinality(String),
+    town LowCardinality(String),
+    district LowCardinality(String),
+    county LowCardinality(String)
+)
+ENGINE = MergeTree
+ORDER BY (postcode1, postcode2, addr1, addr2)
+ # highlight-start
+ SETTINGS disk = disk(
+    type=cache,
+    max_size='1Gi',
+    path='/var/lib/clickhouse/custom_disk_cache/',
+    disk=disk(
+        type=web,
+        endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
+    )
+ );
+ # highlight-end
+```
+
+where `web` is a disk name from the server configuration file:
+
+``` xml
+
+
+
+            web
+            'https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
+
+
+
+```
+
-Let's take a closer look at the configuration parameters.
-
-All disk configurations require a `type` section, equal to one of `s3`, `azure_blob_storage`, `hdfs`, `local`, `cache`, `web`. It is followed by the configuration of the specific storage type.
-Starting from ClickHouse version 24.1, you can use a new configuration option. It requires specifying `type` as `object_storage`, `object_storage_type` as one of `s3`, `azure_blob_storage`, `hdfs`, `local`, `cache`, `web`, and optionally `metadata_type`, which is `local` by default, but can also be set to `plain`, `web`.
-
-E.g.
first configuration option:
``` xml
-<s3>
-    <type>s3</type>
-    <endpoint>https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/</endpoint>
-    <use_environment_credentials>1</use_environment_credentials>
-</s3>
+<storage_configuration>
+    <disks>
+        <web>
+            <type>web</type>
+            <endpoint>https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/</endpoint>
+        </web>
+    </disks>
+</storage_configuration>
```

-and second (from `24.1`):
+### Using S3 Storage {#s3-storage}
+
+Required parameters:
+
+- `endpoint` — S3 endpoint URL in `path` or `virtual hosted` [styles](https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html). The endpoint URL should contain a bucket and the root path to store data.
+- `access_key_id` — S3 access key id.
+- `secret_access_key` — S3 secret access key.
+
+Optional parameters:
+
+- `region` — S3 region name.
+- `support_batch_delete` — Controls the check for whether batch deletes are supported. Set this to `false` when using Google Cloud Storage (GCS), as GCS does not support batch deletes; suppressing the check prevents error messages in the logs.
+- `use_environment_credentials` — Reads AWS credentials from the environment variables `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY` and `AWS_SESSION_TOKEN` if they exist. Default value is `false`.
+- `use_insecure_imds_request` — If set to `true`, the S3 client will use an insecure IMDS request while obtaining credentials from Amazon EC2 metadata. Default value is `false`.
+- `expiration_window_seconds` — Grace period for checking if expiration-based credentials have expired. Optional, default value is `120`.
+- `proxy` — Proxy configuration for the S3 endpoint. Each `uri` element inside the `proxy` block should contain a proxy URL.
+- `connect_timeout_ms` — Socket connect timeout in milliseconds. Default value is `10 seconds`.
+- `request_timeout_ms` — Request timeout in milliseconds. Default value is `5 seconds`.
+- `retry_attempts` — Number of retry attempts in case of a failed request. Default value is `10`.
+- `single_read_retries` — Number of retry attempts in case of a connection drop during read. Default value is `4`.
+- `min_bytes_for_seek` — Minimal number of bytes to use a seek operation instead of sequential read. Default value is `1 Mb`.
+- `metadata_path` — Path on the local FS to store metadata files for S3. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
+- `skip_access_check` — If true, disk access checks will not be performed on disk start-up. Default value is `false`.
+- `header` — Adds the specified HTTP header to a request to the given endpoint. Optional, can be specified multiple times.
+- `server_side_encryption_customer_key_base64` — If specified, required headers for accessing S3 objects with SSE-C encryption will be set.
+- `server_side_encryption_kms_key_id` - If specified, required headers for accessing S3 objects with [SSE-KMS encryption](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingKMSEncryption.html) will be set. If an empty string is specified, the AWS managed S3 key will be used. Optional.
+- `server_side_encryption_kms_encryption_context` - If specified alongside `server_side_encryption_kms_key_id`, the given encryption context header for SSE-KMS will be set. Optional.
+- `server_side_encryption_kms_bucket_key_enabled` - If specified alongside `server_side_encryption_kms_key_id`, the header to enable S3 bucket keys for SSE-KMS will be set. Optional, can be `true` or `false`, defaults to nothing (matches the bucket-level setting).
+- `s3_max_put_rps` — Maximum PUT requests per second rate before throttling. Default value is `0` (unlimited).
+- `s3_max_put_burst` — Max number of requests that can be issued simultaneously before hitting the requests-per-second limit. By default (`0` value) equals to `s3_max_put_rps`.
+- `s3_max_get_rps` — Maximum GET requests per second rate before throttling. Default value is `0` (unlimited).
+- `s3_max_get_burst` — Max number of requests that can be issued simultaneously before hitting the requests-per-second limit. By default (`0` value) equals to `s3_max_get_rps`.
+- `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
+- `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
+- `key_template` — Defines the format with which object keys are generated. By default, ClickHouse takes the `root path` from the `endpoint` option and adds a randomly generated suffix: a directory of 3 random symbols and a file name of 29 random symbols. With this option you have full control over how object keys are generated. Some usage scenarios require random symbols in the prefix or in the middle of the object key, for example: `[a-z]{3}-prefix-random/constant-part/random-middle-[a-z]{3}/random-suffix-[a-z]{29}`. The value is parsed with [`re2`](https://github.com/google/re2/wiki/Syntax); only a subset of the syntax is supported, so check that your preferred format is supported before using this option. The disk isn't initialized if ClickHouse is unable to generate a key from the value of `key_template`. It requires the feature flag [storage_metadata_write_full_object_key](/docs/en/operations/settings/settings#storage_metadata_write_full_object_key) to be enabled, it forbids declaring the `root path` in the `endpoint` option, and it requires the option `key_compatibility_prefix` to be defined.
+- `key_compatibility_prefix` — This option is required when `key_template` is in use. To be able to read object keys that were stored in metadata files with a metadata version lower than `VERSION_FULL_OBJECT_KEY`, the previous `root path` from the `endpoint` option should be set here.
+
+:::note
+Google Cloud Storage (GCS) is also supported using the type `s3`. See [GCS backed MergeTree](/docs/en/integrations/gcs).
+:::
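+
+For illustration, such a disk can also be declared dynamically in a query, as described in [dynamic configuration](#dynamic-configuration) above (a minimal sketch; the endpoint is the sample bucket used throughout this page, and the credentials method should be adapted to your setup):
+
+```sql
+CREATE TABLE test (a Int32, b String)
+ENGINE = MergeTree() ORDER BY a
+SETTINGS disk = disk(
+    name = 's3_disk',
+    type = 's3',
+    endpoint = 'https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/',
+    use_environment_credentials = 1);
+```
+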
+### Using Azure Blob Storage {#azure-blob-storage}
+
+`MergeTree` family table engines can store data to [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) using a disk with type `azure_blob_storage`.
+
+As of February 2022, this feature is still a fresh addition, so expect that some Azure Blob Storage functionalities might be unimplemented.
+
+Configuration markup:
+``` xml
+<storage_configuration>
+    ...
+    <disks>
+        <blob_storage_disk>
+            <type>azure_blob_storage</type>
+            <storage_account_url>http://account.blob.core.windows.net</storage_account_url>
+            <container_name>container</container_name>
+            <account_name>account</account_name>
+            <account_key>pass123</account_key>
+            <metadata_path>/var/lib/clickhouse/disks/blob_storage_disk/</metadata_path>
+            <cache_path>/var/lib/clickhouse/disks/blob_storage_disk/cache/</cache_path>
+            <skip_access_check>false</skip_access_check>
+        </blob_storage_disk>
+    </disks>
+    ...
+</storage_configuration>
+```
+
+Connection parameters:
+* `storage_account_url` - **Required**, Azure Blob Storage account URL, like `http://account.blob.core.windows.net` or `http://azurite1:10000/devstoreaccount1`.
+* `container_name` - Target container name, defaults to `default-container`.
+* `container_already_exists` - If set to `false`, a new container `container_name` is created in the storage account; if set to `true`, the disk connects to the container directly; and if left unset, the disk connects to the account, checks if the container `container_name` exists, and creates it if it doesn't exist yet.
+
+Authentication parameters (the disk will try all available methods **and** Managed Identity Credential):
+* `connection_string` - For authentication using a connection string.
+* `account_name` and `account_key` - For authentication using Shared Key.
+
+Limit parameters (mainly for internal usage):
+* `s3_max_single_part_upload_size` - Limits the size of a single block upload to Blob Storage.
+* `min_bytes_for_seek` - Limits the size of a seekable region.
+* `max_single_read_retries` - Limits the number of attempts to read a chunk of data from Blob Storage.
+* `max_single_download_retries` - Limits the number of attempts to download a readable buffer from Blob Storage.
+* `thread_pool_size` - Limits the number of threads with which `IDiskRemote` is instantiated.
+* `s3_max_inflight_parts_for_one_file` - Limits the number of put requests that can be run concurrently for one object.
+
+Other parameters:
+* `metadata_path` - Path on local FS to store metadata files for Blob Storage. Default value is `/var/lib/clickhouse/disks/<disk_name>/`.
+* `skip_access_check` - If true, disk access checks will not be performed on disk start-up. Default value is `false`.
+* `read_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of read requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
+* `write_resource` — Resource name to be used for [scheduling](/docs/en/operations/workload-scheduling.md) of write requests to this disk. Default value is empty string (IO scheduling is not enabled for this disk).
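+
+As with S3, an Azure disk can also be declared dynamically in a query (a sketch only; the account URL, container, and credentials below are placeholders, and the parameter names are assumed to mirror the XML options above):
+
+```sql
+CREATE TABLE test (a Int32, b String)
+ENGINE = MergeTree() ORDER BY a
+SETTINGS disk = disk(
+    type = 'azure_blob_storage',
+    storage_account_url = 'http://account.blob.core.windows.net',
+    container_name = 'container',
+    account_name = 'account',
+    account_key = 'pass123');
+```
+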
+Examples of working configurations can be found in integration tests directory (see e.g. [test_merge_tree_azure_blob_storage](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_merge_tree_azure_blob_storage/configs/config.d/storage_conf.xml) or [test_azure_blob_storage_zero_copy_replication](https://github.com/ClickHouse/ClickHouse/blob/master/tests/integration/test_azure_blob_storage_zero_copy_replication/configs/config.d/storage_conf.xml)).
+
+:::note Zero-copy replication is not ready for production
+Zero-copy replication is disabled by default in ClickHouse version 22.8 and higher. This feature is not recommended for production use.
+:::
+
+### Using HDFS storage {#hdfs-storage}
+
+In this sample configuration:
+- the disk is of type `hdfs`
+- the data is hosted at `hdfs://hdfs1:9000/clickhouse/`
+
+```xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <hdfs>
+                <type>hdfs</type>
+                <endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
+                <skip_access_check>true</skip_access_check>
+            </hdfs>
+            <hdd>
+                <type>local</type>
+                <path>/</path>
+            </hdd>
+        </disks>
+        <policies>
+            <hdfs>
+                <volumes>
+                    <main>
+                        <disk>hdfs</disk>
+                    </main>
+                    <external>
+                        <disk>hdd</disk>
+                    </external>
+                </volumes>
+            </hdfs>
+        </policies>
+    </storage_configuration>
+</clickhouse>
``` -is equal to -``` xml - - object_storage - s3 - plain - https://s3.eu-west-1.amazonaws.com/clickhouse-eu-west-1.clickhouse.com/data/ - 1 - -``` - -For details configuration options of each storage see [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md). - -## Using Virtual File System for Data Encryption {#encrypted-virtual-file-system} +### Using Data Encryption {#encrypted-virtual-file-system} You can encrypt the data stored on [S3](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#table_engine-mergetree-s3), or [HDFS](#configuring-hdfs) external disks, or on a local disk. To turn on the encryption mode, in the configuration file you must define a disk with the type `encrypted` and choose a disk on which the data will be saved. An `encrypted` disk ciphers all written files on the fly, and when you read files from an `encrypted` disk it deciphers them automatically. So you can work with an `encrypted` disk like with a normal one. @@ -230,7 +454,7 @@ Example of disk configuration:
```

-## Using local cache {#using-local-cache}
+### Using local cache {#using-local-cache}

It is possible to configure local cache over disks in storage configuration starting from version 22.3.
For versions 22.3 - 22.7 cache is supported only for `s3` disk type. For versions >= 22.8 cache is supported for any disk type: S3, Azure, Local, Encrypted, etc.

@@ -393,7 +617,56 @@ Cache profile events:

-    `CachedWriteBufferCacheWriteBytes`, `CachedWriteBufferCacheWriteMicroseconds`

-## Storing Data on Web Server {#storing-data-on-webserver}
+### Using static Web storage (read-only) {#web-storage}
+
+Web storage can be used for read-only purposes. An example use is for hosting sample
+data, or for migrating data.
+
+:::tip
+Storage can also be configured temporarily within a query, if a web dataset is not expected
+to be used routinely, see [dynamic storage](#dynamic-storage) and skip editing the
+configuration file.
+:::
+
+In this sample configuration:
+- the disk is of type `web`
+- the data is hosted at `http://nginx:80/test1/`
+- a cache on local storage is used
+
+```xml
+<clickhouse>
+    <storage_configuration>
+        <disks>
+            <web>
+                <type>web</type>
+                <endpoint>http://nginx:80/test1/</endpoint>
+            </web>
+            <cached_web>
+                <type>cache</type>
+                <disk>web</disk>
+                <path>cached_web_cache/</path>
+                <max_size>100000000</max_size>
+            </cached_web>
+        </disks>
+        <policies>
+            <web>
+                <volumes>
+                    <main>
+                        <disk>web</disk>
+                    </main>
+                </volumes>
+            </web>
+            <cached_web>
+                <volumes>
+                    <main>
+                        <disk>cached_web</disk>
+                    </main>
+                </volumes>
+            </cached_web>
+        </policies>
+    </storage_configuration>
+</clickhouse>
+``` There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`. @@ -595,7 +868,7 @@ If URL is not reachable on disk load when the server is starting up tables, then Use [http_max_single_read_retries](/docs/en/operations/settings/settings.md/#http-max-single-read-retries) setting to limit the maximum number of retries during a single HTTP read. -## Zero-copy Replication (not ready for production) {#zero-copy} +### Zero-copy Replication (not ready for production) {#zero-copy} Zero-copy replication is possible, but not recommended, with `S3` and `HDFS` disks. Zero-copy replication means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself. From 5ae410e6339fe52e33b41bbc9c6c115ac6293f57 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 15 Feb 2024 18:33:38 +0100 Subject: [PATCH 0027/1081] A bit more explanation --- .../mergetree-family/mergetree.md | 49 +------------------ docs/en/operations/storing-data.md | 44 ++++++++++++++++- 2 files changed, 44 insertions(+), 49 deletions(-) diff --git a/docs/en/engines/table-engines/mergetree-family/mergetree.md b/docs/en/engines/table-engines/mergetree-family/mergetree.md index 0fff13c906f..f23b251f3a1 100644 --- a/docs/en/engines/table-engines/mergetree-family/mergetree.md +++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md @@ -940,53 +940,6 @@ You could change storage policy after table creation with [ALTER TABLE ... MODIF The number of threads performing background moves of data parts can be changed by [background_move_pool_size](/docs/en/operations/server-configuration-parameters/settings.md/#background_move_pool_size) setting. -### Dynamic Storage - -This example query shows how to attach a table stored at a URL and configure the -remote storage within the query. The web storage is not configured in the ClickHouse -configuration files; all the settings are in the CREATE/ATTACH query. - -:::note -The example uses `type=web`, but any disk type can be configured as dynamic, even Local disk. Local disks require a path argument to be inside the server config parameter `custom_local_disks_base_directory`, which has no default, so set that also when using local disk. -::: - -#### Example dynamic web storage - -:::tip -A [demo dataset](https://github.com/ClickHouse/web-tables-demo) is hosted in GitHub. To prepare your own tables for web storage see the tool [clickhouse-static-files-uploader](/docs/en/operations/storing-data.md/#storing-data-on-webserver) -::: - -In this `ATTACH TABLE` query the `UUID` provided matches the directory name of the data, and the endpoint is the URL for the raw GitHub content. 
-
-```sql
-# highlight-next-line
-ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
-(
-    price UInt32,
-    date Date,
-    postcode1 LowCardinality(String),
-    postcode2 LowCardinality(String),
-    type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
-    is_new UInt8,
-    duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
-    addr1 String,
-    addr2 String,
-    street LowCardinality(String),
-    locality LowCardinality(String),
-    town LowCardinality(String),
-    district LowCardinality(String),
-    county LowCardinality(String)
-)
-ENGINE = MergeTree
-ORDER BY (postcode1, postcode2, addr1, addr2)
-  # highlight-start
-  SETTINGS disk = disk(
-      type=web,
-      endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
-    );
-  # highlight-end
-```

### Details {#details}

In the case of `MergeTree` tables, data is getting to disk in different ways:
@@ -1025,7 +978,7 @@ Configuration markup:
``` xml
    ...
-e
    s3
    true

diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md
index baf4e1999a7..0f818b813bf 100644
--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@@ -213,6 +213,10 @@ ORDER BY (postcode1, postcode2, addr1, addr2)
 In the settings highlighted above, notice that the disk of `type=web` is nested within
 the disk of `type=cache`.
 
+:::note
+The example uses `type=web`, but any disk type can be configured as dynamic, even Local disk. Local disks require a path argument to be inside the server config parameter `custom_local_disks_base_directory`, which has no default, so set that also when using local disk.
+:::
+
 A combination of config-based configuration and sql-defined configuration is also possible:
 
 ```sql
@@ -302,6 +306,11 @@ Optional parameters:
 Google Cloud Storage (GCS) is also supported using the type `s3`. See [GCS backed MergeTree](/docs/en/integrations/gcs).
 :::
 
+### Using Plain Storage {#plain-storage}
+
+There is a disk type `s3_plain`, which provides write-once storage. Unlike the `s3` disk type, it stores data as is: instead of randomly generated blob names, it uses normal file names, the same way ClickHouse stores files on a local disk. This disk type therefore keeps a static version of the table and can also be used to create backups on it.
+Configuration parameters are the same as for the `s3` disk type.
+
 ### Using Azure Blob Storage {#azure-blob-storage}
 
 `MergeTree` family table engines can store data to [Azure Blob Storage](https://azure.microsoft.com/en-us/services/storage/blobs/) using a disk with type `azure_blob_storage`.
@@ -672,7 +681,40 @@ There is a tool `clickhouse-static-files-uploader`, which prepares a data direct
 
 This is a read-only disk. Its data is only read and never modified. A new table is loaded to this disk via `ATTACH TABLE` query (see example below). Local disk is not actually used, each `SELECT` query will result in a `http` request to fetch required data. All modification of the table data will result in an exception, i.e. the following types of queries are not allowed: [CREATE TABLE](/docs/en/sql-reference/statements/create/table.md), [ALTER TABLE](/docs/en/sql-reference/statements/alter/index.md), [RENAME TABLE](/docs/en/sql-reference/statements/rename.md/#misc_operations-rename_table), [DETACH TABLE](/docs/en/sql-reference/statements/detach.md) and [TRUNCATE TABLE](/docs/en/sql-reference/statements/truncate.md). 

-Web server storage is supported only for the [MergeTree](/docs/en/engines/table-engines/mergetree-family/mergetree.md) and [Log](/docs/en/engines/table-engines/log-family/log.md) engine families. To access the data stored on a `web` disk, use the [storage_policy](/docs/en/engines/table-engines/mergetree-family/mergetree.md/#terms) setting when executing the query. For example, `ATTACH TABLE table_web UUID '{}' (id Int32) ENGINE = MergeTree() ORDER BY id SETTINGS storage_policy = 'web'`.
+:::tip
+A [demo dataset](https://github.com/ClickHouse/web-tables-demo) is hosted on GitHub. To prepare your own tables for web storage, see the tool [clickhouse-static-files-uploader](/docs/en/operations/storing-data.md/#web-storage).
+:::
+
+In this `ATTACH TABLE` query, the `UUID` provided matches the directory name of the data, and the endpoint is the URL for the raw GitHub content.
+
+```sql
+# highlight-next-line
+ATTACH TABLE uk_price_paid UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7'
+(
+    price UInt32,
+    date Date,
+    postcode1 LowCardinality(String),
+    postcode2 LowCardinality(String),
+    type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4),
+    is_new UInt8,
+    duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2),
+    addr1 String,
+    addr2 String,
+    street LowCardinality(String),
+    locality LowCardinality(String),
+    town LowCardinality(String),
+    district LowCardinality(String),
+    county LowCardinality(String)
+)
+ENGINE = MergeTree
+ORDER BY (postcode1, postcode2, addr1, addr2)
+  # highlight-start
+  SETTINGS disk = disk(
+      type=web,
+      endpoint='https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/'
+    );
+  # highlight-end
+```

A ready test case. You need to add this configuration to config:

From 09e630e02be9ccd19681b34f33e24cea849ca9fd Mon Sep 17 00:00:00 2001
From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com>
Date: Thu, 15 Feb 2024 19:00:08 +0100
Subject: [PATCH 0028/1081] Update storing-data.md

---
 docs/en/operations/storing-data.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md
index 0f818b813bf..60e33fe2849 100644
--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@@ -10,7 +10,7 @@ Data, processed in ClickHouse, is usually stored in the local file system — on
 2. The Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html))
 3. [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs).

-Note: to work with data stored on `Amazon S3` disks use [S3](/docs/en/engines/table-engines/integrations/s3.md) table engine, to work with data in the Hadoop Distributed File System — [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine, and to work with data stored in Azure Blob Storage use [AzureBlobStorage](/docs/en/engines/table-engines/integrations/AzureBlobStorage.md) table engine. They are different from external storage described on this page as they allow to read data stored in some general file format (like Parquet), while on this page we are describing storage configuration for ClickHouse `MergeTree` famility or `Log` family tables. 
+Note: to work with data stored on `Amazon S3` disks use [S3](/docs/en/engines/table-engines/integrations/s3.md) table engine, to work with data in the Hadoop Distributed File System — [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine, and to work with data stored in Azure Blob Storage use [AzureBlobStorage](/docs/en/engines/table-engines/integrations/AzureBlobStorage.md) table engine. They are different from external storage described on this page as they allow to read data stored in some general file format (like Parquet), while on this page we are describing storage configuration for ClickHouse `MergeTree` family or `Log` family tables. ## Configuring external storage {#configuring-external-storage} From 7bf42fd86e9599357282f947312c98d2bec1047f Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 16 Feb 2024 11:16:14 +0100 Subject: [PATCH 0029/1081] Fix upgrade check --- src/Core/SettingsChangesHistory.h | 3 ++- src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index c453dd837eb..b6d07d7057a 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -93,7 +93,8 @@ static std::map sett {"async_insert_busy_timeout_decrease_rate", 0.2, 0.2, "The exponential growth rate at which the adaptive asynchronous insert timeout decreases"}, {"split_parts_ranges_into_intersecting_and_non_intersecting_final", true, true, "Allow to split parts ranges into intersecting and non intersecting during FINAL optimization"}, {"split_intersecting_parts_ranges_into_layers_final", true, true, "Allow to split intersecting parts ranges into layers during FINAL optimization"}, - {"azure_max_single_part_copy_size", 256*1024*1024, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage."}}}, + {"azure_max_single_part_copy_size", 256*1024*1024, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage."}, + {"azure_max_inflight_parts_for_one_file", 20, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 
0 means unlimited."}}}, {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, {"output_format_arrow_use_signed_indexes_for_dictionary", false, true, "Use signed indexes type for Arrow dictionaries by default as it's recommended"}, diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp index d700090303a..74a8949b235 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp @@ -98,7 +98,7 @@ void WriteBufferFromAzureBlobStorage::finalizeImpl() auto block_blob_client = blob_container_client->GetBlockBlobClient(blob_path); execWithRetry([&](){ block_blob_client.CommitBlockList(block_ids); }, max_unexpected_write_error_retries); - LOG_DEBUG(log, "Committed {} blocks for blob `{}`", block_ids.size(), blob_path); + LOG_TRACE(log, "Committed {} blocks for blob `{}`", block_ids.size(), blob_path); } void WriteBufferFromAzureBlobStorage::nextImpl() From a11e67d4aae4433dd0f3d8ee46ba40e1cd73fdd5 Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Fri, 16 Feb 2024 16:41:58 +0100 Subject: [PATCH 0030/1081] Make max_insert_delayed_streams_for_parallel_write actually work --- src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index 1fb2393948a..f5494e56049 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -323,6 +323,9 @@ void ReplicatedMergeTreeSinkImpl::consume(Chunk chunk) if (!temp_part.part) continue; + if (!support_parallel_write && temp_part.part->getDataPartStorage().supportParallelWrite()) + support_parallel_write = true; + BlockIDsType block_id; if constexpr (async_insert) From 458793cc50b92361848c91803d07105a91acea85 Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Fri, 16 Feb 2024 17:13:37 +0100 Subject: [PATCH 0031/1081] Review fix --- src/Storages/MergeTree/MergeTreeSink.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 36816904a81..ebc49e22d03 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -109,9 +109,14 @@ void MergeTreeSink::consume(Chunk chunk) } } - size_t max_insert_delayed_streams_for_parallel_write = DEFAULT_DELAYED_STREAMS_FOR_PARALLEL_WRITE; - if (!support_parallel_write || settings.max_insert_delayed_streams_for_parallel_write.changed) + size_t max_insert_delayed_streams_for_parallel_write; + + if (settings.max_insert_delayed_streams_for_parallel_write.changed) max_insert_delayed_streams_for_parallel_write = settings.max_insert_delayed_streams_for_parallel_write; + else if (support_parallel_write) + max_insert_delayed_streams_for_parallel_write = DEFAULT_DELAYED_STREAMS_FOR_PARALLEL_WRITE; + else + max_insert_delayed_streams_for_parallel_write = 0; /// In case of too much columns/parts in block, flush explicitly. 
streams += temp_part.streams.size(); From f7b524465c60b15c85f579ca22c48d4c165bf6f2 Mon Sep 17 00:00:00 2001 From: Alexander Sapin Date: Fri, 16 Feb 2024 17:14:36 +0100 Subject: [PATCH 0032/1081] Followup --- src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp index f5494e56049..3cbdcf5106e 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp @@ -368,9 +368,13 @@ void ReplicatedMergeTreeSinkImpl::consume(Chunk chunk) profile_events_scope.reset(); UInt64 elapsed_ns = watch.elapsed(); - size_t max_insert_delayed_streams_for_parallel_write = DEFAULT_DELAYED_STREAMS_FOR_PARALLEL_WRITE; - if (!support_parallel_write || settings.max_insert_delayed_streams_for_parallel_write.changed) + size_t max_insert_delayed_streams_for_parallel_write; + if (settings.max_insert_delayed_streams_for_parallel_write.changed) max_insert_delayed_streams_for_parallel_write = settings.max_insert_delayed_streams_for_parallel_write; + else if (support_parallel_write) + max_insert_delayed_streams_for_parallel_write = DEFAULT_DELAYED_STREAMS_FOR_PARALLEL_WRITE; + else + max_insert_delayed_streams_for_parallel_write = 0; /// In case of too much columns/parts in block, flush explicitly. streams += temp_part.streams.size(); From 1549725eddb6db299ba0297de21a51411607d2a3 Mon Sep 17 00:00:00 2001 From: unashi Date: Sun, 18 Feb 2024 19:26:12 +0800 Subject: [PATCH 0033/1081] [feature]: allow to attach parts from a different disk --- src/Storages/MergeTree/MergeTreeData.cpp | 13 ++ src/Storages/MergeTree/MergeTreeData.h | 9 + .../MergeTree/MergeTreeDataPartCloner.cpp | 70 ++++++- src/Storages/StorageMergeTree.cpp | 45 +++-- src/Storages/StorageReplicatedMergeTree.cpp | 42 ++-- .../__init__.py | 0 .../configs/remote_servers.xml | 17 ++ .../test_attach_partition_using_copy/test.py | 183 ++++++++++++++++++ 8 files changed, 353 insertions(+), 26 deletions(-) create mode 100644 tests/integration/test_attach_partition_using_copy/__init__.py create mode 100644 tests/integration/test_attach_partition_using_copy/configs/remote_servers.xml create mode 100644 tests/integration/test_attach_partition_using_copy/test.py diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 3ca746a7197..56710b157de 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7085,6 +7085,19 @@ std::pair MergeTreeData::cloneAn this, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, require_part_metadata, params, read_settings, write_settings); } +std::pair MergeTreeData::cloneAndLoadDataPartOnOtherDisk( + const MergeTreeData::DataPartPtr & src_part, + const String & tmp_part_prefix, + const MergeTreePartInfo & dst_part_info, + const StorageMetadataPtr & metadata_snapshot, + const IDataPartStorage::ClonePartParams & params, + const ReadSettings & read_settings, + const WriteSettings & write_settings) +{ + return MergeTreeDataPartCloner::clone( + this, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, require_part_metadata, params, read_settings, write_settings); +} + std::pair MergeTreeData::cloneAndLoadPartOnSameDiskWithDifferentPartitionKey( const MergeTreeData::DataPartPtr & src_part, const MergeTreePartition & new_partition, diff --git a/src/Storages/MergeTree/MergeTreeData.h 
b/src/Storages/MergeTree/MergeTreeData.h
index dfdc22baa8f..a24362f68fc 100644
--- a/src/Storages/MergeTree/MergeTreeData.h
+++ b/src/Storages/MergeTree/MergeTreeData.h
@@ -866,6 +866,15 @@ public:
        ContextPtr local_context,
        Int64 min_block,
        Int64 max_block);
+    
+    std::pair cloneAndLoadDataPartOnOtherDisk(
+        const MergeTreeData::DataPartPtr & src_part,
+        const String & tmp_part_prefix,
+        const MergeTreePartInfo & dst_part_info,
+        const StorageMetadataPtr & metadata_snapshot,
+        const IDataPartStorage::ClonePartParams & params,
+        const ReadSettings & read_settings,
+        const WriteSettings & write_settings);

    static std::pair createPartitionAndMinMaxIndexFromSourcePart(
        const MergeTreeData::DataPartPtr & src_part,
diff --git a/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp
index 04019d2c665..69b7abacc93 100644
--- a/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartCloner.cpp
@@ -142,6 +142,30 @@ std::shared_ptr hardlinkAllFiles(
        params);
 }

+/// Copy all files of the part, trying each disk of the storage policy in turn
+/// and returning the storage for the first disk that accepts the copy.
+std::shared_ptr cloneAllFiles(
+    MergeTreeData * merge_tree_data,
+    const DB::ReadSettings & read_settings,
+    const DB::WriteSettings & write_settings,
+    const DataPartStoragePtr & storage,
+    const String & path)
+{
+    std::exception_ptr last_exception;
+    for (const DiskPtr & disk : merge_tree_data->getStoragePolicy()->getDisks())
+    {
+        try
+        {
+            return storage->clonePart(
+                merge_tree_data->getRelativeDataPath(),
+                path,
+                disk,
+                read_settings,
+                write_settings, {}, {});
+        }
+        catch (...)
+        {
+            last_exception = std::current_exception();
+            LOG_TRACE(&Poco::Logger::get("MergeTreeDataPartCloner"), "Clone part on disk {} failed", disk->getName());
+        }
+    }
+    LOG_FATAL(&Poco::Logger::get("MergeTreeDataPartCloner"), "Clone part failed on all disks");
+    /// A bare `throw;` here would call std::terminate because no exception is in flight,
+    /// so rethrow the error captured from the last attempted disk instead.
+    std::rethrow_exception(last_exception);
+}
+
 std::pair cloneSourcePart(
     MergeTreeData * merge_tree_data,
     const MergeTreeData::DataPartPtr & src_part,
@@ -165,8 +189,18 @@ std::pair cloneSourcePart(
     auto src_part_storage = flushPartStorageToDiskIfInMemory(
         merge_tree_data, src_part, metadata_snapshot, tmp_part_prefix, tmp_dst_part_name, src_flushed_tmp_dir_lock, src_flushed_tmp_part);
-
-    auto dst_part_storage = hardlinkAllFiles(merge_tree_data, read_settings, write_settings, src_part_storage, tmp_dst_part_name, params);
+    std::shared_ptr dst_part_storage{};
+    if (params.copy_instead_of_hardlink)
+    {
+        dst_part_storage = cloneAllFiles(merge_tree_data, read_settings, write_settings, src_part_storage, tmp_dst_part_name);
+    }
+    else
+    {
+        try
+        {
+            dst_part_storage = hardlinkAllFiles(merge_tree_data, read_settings, write_settings, src_part_storage, tmp_dst_part_name, params);
+        }
+        catch (...)
+        {
+            /// Hard links failed (e.g. the destination is on another disk), fall back to copying.
+            LOG_WARNING(&Poco::Logger::get("MergeTreeDataPartCloner"), "Hard link fail, try to copy directly. 
to:{}, path:{}", merge_tree_data->getRelativeDataPath(),tmp_dst_part_name); + dst_part_storage = cloneAllFiles(merge_tree_data, read_settings, write_settings, src_part_storage, tmp_dst_part_name); + } + } if (params.metadata_version_to_write.has_value()) { @@ -275,6 +309,25 @@ std::pair cloneAndHand return std::make_pair(destination_part, std::move(temporary_directory_lock)); } + +std::pair cloneInsteadOfHardlinksAndProjections( + MergeTreeData * merge_tree_data, + const DataPartPtr & src_part, + const StorageMetadataPtr & metadata_snapshot, + const MergeTreePartInfo & dst_part_info, + const String & tmp_part_prefix, + const ReadSettings & read_settings, + const WriteSettings & write_settings, + const IDataPartStorage::ClonePartParams & params) +{ + chassert(!merge_tree_data->isStaticStorage()); + + auto [destination_part, temporary_directory_lock] = cloneSourcePart( + merge_tree_data, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, read_settings, write_settings, params); + + return std::make_pair(destination_part, std::move(temporary_directory_lock)); +} + } std::pair MergeTreeDataPartCloner::clone( @@ -288,10 +341,19 @@ std::pair MergeTreeDat const ReadSettings & read_settings, const WriteSettings & write_settings) { - auto [destination_part, temporary_directory_lock] = cloneAndHandleHardlinksAndProjections( + if (params.copy_instead_of_hardlink) + { + auto [destination_part, temporary_directory_lock] = cloneInsteadOfHardlinksAndProjections( merge_tree_data, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, read_settings, write_settings, params); + return std::make_pair(finalizePart(destination_part, params, require_part_metadata), std::move(temporary_directory_lock)); + } + else + { + auto [destination_part, temporary_directory_lock] = cloneAndHandleHardlinksAndProjections( + merge_tree_data, src_part, metadata_snapshot, dst_part_info, tmp_part_prefix, read_settings, write_settings, params); + return std::make_pair(finalizePart(destination_part, params, require_part_metadata), std::move(temporary_directory_lock)); - return std::make_pair(finalizePart(destination_part, params, require_part_metadata), std::move(temporary_directory_lock)); + } } std::pair MergeTreeDataPartCloner::cloneWithDistinctPartitionExpression( diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 0f75c726bce..0f95fef9c6e 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -2118,17 +2118,40 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con else { MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - - auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( - src_part, - TMP_PREFIX, - dst_part_info, - my_metadata_snapshot, - clone_params, - local_context->getReadSettings(), - local_context->getWriteSettings()); - dst_parts.emplace_back(std::move(dst_part)); - dst_parts_locks.emplace_back(std::move(part_lock)); + LOG_TRACE(log, "Partition exps are the same:part id: {}; number of disks:{}",dst_part_info.partition_id, this->getStoragePolicy()->getDisks().size()); + bool on_same_disk = false; + for (const DiskPtr & disk : this->getStoragePolicy()->getDisks()) + { + if (disk->getName() == src_part->getDataPartStorage().getDiskName()) + on_same_disk = true; + } + if (on_same_disk) + { + auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( + src_part, + TMP_PREFIX, + dst_part_info, + my_metadata_snapshot, + clone_params, + 
local_context->getReadSettings(), + local_context->getWriteSettings()); + dst_parts.emplace_back(std::move(dst_part)); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + else + { + clone_params.copy_instead_of_hardlink = true; + auto [dst_part, part_lock] = cloneAndLoadDataPartOnOtherDisk( + src_part, + TMP_PREFIX, + dst_part_info, + my_metadata_snapshot, + clone_params, + local_context->getReadSettings(), + local_context->getWriteSettings()); + dst_parts.emplace_back(std::move(dst_part)); + dst_parts_locks.emplace_back(std::move(part_lock)); + } } } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 6bd57cc4d6d..ba0d27fe612 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8015,17 +8015,37 @@ void StorageReplicatedMergeTree::replacePartitionFrom( { MergeTreePartInfo dst_part_info(partition_id, index, index, src_part->info.level); - auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( - src_part, - TMP_PREFIX, - dst_part_info, - metadata_snapshot, - clone_params, - query_context->getReadSettings(), - query_context->getWriteSettings()); - - dst_parts.emplace_back(dst_part); - dst_parts_locks.emplace_back(std::move(part_lock)); + bool on_same_disk = false; + for (const DiskPtr & disk : this->getStoragePolicy()->getDisks()) + if (disk->getName() == src_part->getDataPartStorage().getDiskName()) + on_same_disk = true; + if (on_same_disk) + { + auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( + src_part, + TMP_PREFIX, + dst_part_info, + metadata_snapshot, + clone_params, + query_context->getReadSettings(), + query_context->getWriteSettings()); + dst_parts.emplace_back(dst_part); + dst_parts_locks.emplace_back(std::move(part_lock)); + } + else + { + clone_params.copy_instead_of_hardlink = true; + auto [dst_part, part_lock] = cloneAndLoadDataPartOnOtherDisk( + src_part, + TMP_PREFIX, + dst_part_info, + metadata_snapshot, + clone_params, + query_context->getReadSettings(), + query_context->getWriteSettings()); + dst_parts.emplace_back(dst_part); + dst_parts_locks.emplace_back(std::move(part_lock)); + } } src_parts.emplace_back(src_part); diff --git a/tests/integration/test_attach_partition_using_copy/__init__.py b/tests/integration/test_attach_partition_using_copy/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_attach_partition_using_copy/configs/remote_servers.xml b/tests/integration/test_attach_partition_using_copy/configs/remote_servers.xml new file mode 100644 index 00000000000..b40730e9f7d --- /dev/null +++ b/tests/integration/test_attach_partition_using_copy/configs/remote_servers.xml @@ -0,0 +1,17 @@ + + + + + true + + replica1 + 9000 + + + replica2 + 9000 + + + + + diff --git a/tests/integration/test_attach_partition_using_copy/test.py b/tests/integration/test_attach_partition_using_copy/test.py new file mode 100644 index 00000000000..effb5708cf3 --- /dev/null +++ b/tests/integration/test_attach_partition_using_copy/test.py @@ -0,0 +1,183 @@ +import pytest +from helpers.cluster import ClickHouseCluster +from helpers.test_tools import assert_eq_with_retry + +cluster = ClickHouseCluster(__file__) + +replica1 = cluster.add_instance( + "replica1", with_zookeeper=True, main_configs=["configs/remote_servers.xml"] +) +replica2 = cluster.add_instance( + "replica2", with_zookeeper=True, main_configs=["configs/remote_servers.xml"] +) + + +@pytest.fixture(scope="module") +def start_cluster(): + try: + 
cluster.start() + yield cluster + except Exception as ex: + print(ex) + finally: + cluster.shutdown() + + +def cleanup(nodes): + for node in nodes: + node.query("DROP TABLE IF EXISTS source SYNC") + node.query("DROP TABLE IF EXISTS destination SYNC") + + +def create_source_table(node, table_name, replicated): + replica = node.name + engine = ( + f"ReplicatedMergeTree('/clickhouse/tables/1/{table_name}', '{replica}')" + if replicated + else "MergeTree()" + ) + node.query_with_retry( + """ + ATTACH TABLE {table_name} UUID 'cf712b4f-2ca8-435c-ac23-c4393efe52f7' + ( + price UInt32, + date Date, + postcode1 LowCardinality(String), + postcode2 LowCardinality(String), + type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4), + is_new UInt8, + duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2), + addr1 String, + addr2 String, + street LowCardinality(String), + locality LowCardinality(String), + town LowCardinality(String), + district LowCardinality(String), + county LowCardinality(String) + ) + ENGINE = {engine} + ORDER BY (postcode1, postcode2, addr1, addr2) + SETTINGS disk = disk(type = web, endpoint = 'https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/') + """.format( + table_name=table_name, + engine=engine + ) + ) + + + +def create_destination_table(node, table_name, replicated): + replica = node.name + engine = ( + f"ReplicatedMergeTree('/clickhouse/tables/1/{table_name}', '{replica}')" + if replicated + else "MergeTree()" + ) + node.query_with_retry( + """ + CREATE TABLE {table_name} + ( + price UInt32, + date Date, + postcode1 LowCardinality(String), + postcode2 LowCardinality(String), + type Enum8('other' = 0, 'terraced' = 1, 'semi-detached' = 2, 'detached' = 3, 'flat' = 4), + is_new UInt8, + duration Enum8('unknown' = 0, 'freehold' = 1, 'leasehold' = 2), + addr1 String, + addr2 String, + street LowCardinality(String), + locality LowCardinality(String), + town LowCardinality(String), + district LowCardinality(String), + county LowCardinality(String) + ) + ENGINE = {engine} + ORDER BY (postcode1, postcode2, addr1, addr2) + """.format( + table_name=table_name, + engine=engine + ) + ) + +def test_both_mergtree(start_cluster): + create_source_table(replica1, "source", False) + create_destination_table(replica1, "destination", False) + + replica1.query( + f"ALTER TABLE destination ATTACH PARTITION tuple() FROM source" + ) + + assert_eq_with_retry( + replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC", + replica1.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC"), + ) + + assert_eq_with_retry( + replica1, f"SELECT town from destination LIMIT 1", + "SCARBOROUGH" + ) + + cleanup([replica1]) + +def test_all_replicated(start_cluster): + create_source_table(replica1, "source", True) + create_destination_table(replica1, "destination", True) + create_destination_table(replica2, "destination", True) + + replica1.query("SYSTEM SYNC REPLICA destination") + replica1.query( + f"ALTER TABLE destination ATTACH PARTITION tuple() FROM source" + ) + + assert_eq_with_retry( + replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC", + replica1.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC"), + ) + 
assert_eq_with_retry( + replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC", + replica2.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC"), + ) + + assert_eq_with_retry( + replica1, f"SELECT town from destination LIMIT 1", + "SCARBOROUGH" + ) + + assert_eq_with_retry( + replica2, f"SELECT town from destination LIMIT 1", + "SCARBOROUGH" + ) + + cleanup([replica1, replica2]) + +def test_only_destination_replicated(start_cluster): + create_source_table(replica1, "source", False) + create_destination_table(replica1, "destination", True) + create_destination_table(replica2, "destination", True) + + replica1.query("SYSTEM SYNC REPLICA destination") + replica1.query( + f"ALTER TABLE destination ATTACH PARTITION tuple() FROM source" + ) + + assert_eq_with_retry( + replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC", + replica1.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC"), + ) + assert_eq_with_retry( + replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC", + replica2.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC"), + ) + + assert_eq_with_retry( + replica1, f"SELECT town from destination LIMIT 1", + "SCARBOROUGH" + ) + + assert_eq_with_retry( + replica2, f"SELECT town from destination LIMIT 1", + "SCARBOROUGH" + ) + + cleanup([replica1, replica2]) From 8c11f59ba82bd9ae3a322f7a9729c4a5a8644512 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 19 Feb 2024 11:01:37 +0100 Subject: [PATCH 0034/1081] Fix bad link, update disk web description --- docs/en/operations/storing-data.md | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md index 60e33fe2849..4b0345a3206 100644 --- a/docs/en/operations/storing-data.md +++ b/docs/en/operations/storing-data.md @@ -628,14 +628,9 @@ Cache profile events: ### Using static Web storage (read-only) {#web-storage} -Web storage can be used for read-only purposes. An example use is for hosting sample -data, or for migrating data. - -:::tip -Storage can also be configured temporarily within a query, if a web dataset is not expected -to be used routinely, see [dynamic storage](#dynamic-storage) and skip editing the -configuration file. -::: +This is a read-only disk. Its data is only read and never modified. A new table is loaded to this disk via `ATTACH TABLE` query (see example below). Local disk is not actually used, each `SELECT` query will result in a `http` request to fetch required data. All modification of the table data will result in an exception, i.e. the following types of queries are not allowed: [CREATE TABLE](/docs/en/sql-reference/statements/create/table.md), [ALTER TABLE](/docs/en/sql-reference/statements/alter/index.md), [RENAME TABLE](/docs/en/sql-reference/statements/rename.md/#misc_operations-rename_table), [DETACH TABLE](/docs/en/sql-reference/statements/detach.md) and [TRUNCATE TABLE](/docs/en/sql-reference/statements/truncate.md). 
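+For illustration, reads behave like on any other disk while writes are rejected (a sketch; the exact exception text depends on the server version):
+
+```sql
+-- Each read results in HTTP requests to the configured endpoint.
+SELECT count() FROM uk_price_paid;
+
+-- Any modification fails because the underlying disk is read-only.
+ALTER TABLE uk_price_paid DELETE WHERE price = 0; -- throws an exception
+```
+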
+Web storage can be used for read-only purposes. An example use is for hosting sample data, or for migrating data. +There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`. In this sample configuration: - the disk is of type `web` @@ -677,9 +672,11 @@ In this sample configuration:
```

-There is a tool `clickhouse-static-files-uploader`, which prepares a data directory for a given table (`SELECT data_paths FROM system.tables WHERE name = 'table_name'`). For each table you need, you get a directory of files. These files can be uploaded to, for example, a web server with static files. After this preparation, you can load this table into any ClickHouse server via `DiskWeb`.
-
-This is a read-only disk. Its data is only read and never modified. A new table is loaded to this disk via `ATTACH TABLE` query (see example below). Local disk is not actually used, each `SELECT` query will result in a `http` request to fetch required data. All modification of the table data will result in an exception, i.e. the following types of queries are not allowed: [CREATE TABLE](/docs/en/sql-reference/statements/create/table.md), [ALTER TABLE](/docs/en/sql-reference/statements/alter/index.md), [RENAME TABLE](/docs/en/sql-reference/statements/rename.md/#misc_operations-rename_table), [DETACH TABLE](/docs/en/sql-reference/statements/detach.md) and [TRUNCATE TABLE](/docs/en/sql-reference/statements/truncate.md).
+:::tip
+Storage can also be configured temporarily within a query; if a web dataset is not expected
+to be used routinely, see [dynamic configuration](#dynamic-configuration) and skip editing the
+configuration file.
+:::

 :::tip
 A [demo dataset](https://github.com/ClickHouse/web-tables-demo) is hosted on GitHub. To prepare your own tables for web storage, see the tool [clickhouse-static-files-uploader](/docs/en/operations/storing-data.md/#web-storage).
 :::

From 601b1dfaa14323db28f169b6b193d59ec75e8bfc Mon Sep 17 00:00:00 2001
From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com>
Date: Mon, 19 Feb 2024 12:21:52 +0100
Subject: [PATCH 0035/1081] Fix bad link

---
 docs/en/operations/storing-data.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/operations/storing-data.md b/docs/en/operations/storing-data.md
index 4b0345a3206..4f676904375 100644
--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@@ -10,7 +10,7 @@ Data, processed in ClickHouse, is usually stored in the local file system — on
 2. The Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html))
 3. [Azure Blob Storage](https://azure.microsoft.com/en-us/products/storage/blobs).

-Note: to work with data stored on `Amazon S3` disks use [S3](/docs/en/engines/table-engines/integrations/s3.md) table engine, to work with data in the Hadoop Distributed File System — [HDFS](/docs/en/engines/table-engines/integrations/hdfs.md) table engine, and to work with data stored in Azure Blob Storage use [AzureBlobStorage](/docs/en/engines/table-engines/integrations/AzureBlobStorage.md) table engine. 
They are different from external storage described on this page as they allow to read data stored in some general file format (like Parquet), while on this page we are describing storage configuration for ClickHouse `MergeTree` family or `Log` family tables. ## Configuring external storage {#configuring-external-storage} From 80fe3f78d99caeaed733548ca65b6bd466730d51 Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 20 Feb 2024 11:12:09 +0800 Subject: [PATCH 0036/1081] [fix] black the python script --- .../test_attach_partition_using_copy/test.py | 92 ++++++++++--------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/tests/integration/test_attach_partition_using_copy/test.py b/tests/integration/test_attach_partition_using_copy/test.py index effb5708cf3..df5378742ae 100644 --- a/tests/integration/test_attach_partition_using_copy/test.py +++ b/tests/integration/test_attach_partition_using_copy/test.py @@ -59,13 +59,11 @@ def create_source_table(node, table_name, replicated): ORDER BY (postcode1, postcode2, addr1, addr2) SETTINGS disk = disk(type = web, endpoint = 'https://raw.githubusercontent.com/ClickHouse/web-tables-demo/main/web/') """.format( - table_name=table_name, - engine=engine + table_name=table_name, engine=engine ) ) - def create_destination_table(node, table_name, replicated): replica = node.name engine = ( @@ -95,89 +93,95 @@ def create_destination_table(node, table_name, replicated): ENGINE = {engine} ORDER BY (postcode1, postcode2, addr1, addr2) """.format( - table_name=table_name, - engine=engine + table_name=table_name, engine=engine ) ) + def test_both_mergtree(start_cluster): create_source_table(replica1, "source", False) create_destination_table(replica1, "destination", False) - replica1.query( - f"ALTER TABLE destination ATTACH PARTITION tuple() FROM source" - ) - + replica1.query(f"ALTER TABLE destination ATTACH PARTITION tuple() FROM source") + assert_eq_with_retry( - replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC", - replica1.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC"), + replica1, + f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC", + replica1.query( + f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC" + ), ) - + assert_eq_with_retry( - replica1, f"SELECT town from destination LIMIT 1", - "SCARBOROUGH" + replica1, f"SELECT town from destination LIMIT 1", "SCARBOROUGH" ) cleanup([replica1]) + def test_all_replicated(start_cluster): create_source_table(replica1, "source", True) create_destination_table(replica1, "destination", True) create_destination_table(replica2, "destination", True) replica1.query("SYSTEM SYNC REPLICA destination") - replica1.query( - f"ALTER TABLE destination ATTACH PARTITION tuple() FROM source" + replica1.query(f"ALTER TABLE destination ATTACH PARTITION tuple() FROM source") + + assert_eq_with_retry( + replica1, + f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC", + replica1.query( + f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC" + ), + ) + assert_eq_with_retry( + replica1, + f"SELECT toYear(date) AS year,round(avg(price)) 
AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC", + replica2.query( + f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC" + ), ) assert_eq_with_retry( - replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC", - replica1.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC"), - ) - assert_eq_with_retry( - replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC", - replica2.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC"), + replica1, f"SELECT town from destination LIMIT 1", "SCARBOROUGH" ) assert_eq_with_retry( - replica1, f"SELECT town from destination LIMIT 1", - "SCARBOROUGH" - ) - - assert_eq_with_retry( - replica2, f"SELECT town from destination LIMIT 1", - "SCARBOROUGH" + replica2, f"SELECT town from destination LIMIT 1", "SCARBOROUGH" ) cleanup([replica1, replica2]) + def test_only_destination_replicated(start_cluster): create_source_table(replica1, "source", False) create_destination_table(replica1, "destination", True) create_destination_table(replica2, "destination", True) replica1.query("SYSTEM SYNC REPLICA destination") - replica1.query( - f"ALTER TABLE destination ATTACH PARTITION tuple() FROM source" + replica1.query(f"ALTER TABLE destination ATTACH PARTITION tuple() FROM source") + + assert_eq_with_retry( + replica1, + f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC", + replica1.query( + f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC" + ), + ) + assert_eq_with_retry( + replica1, + f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC", + replica2.query( + f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC" + ), ) assert_eq_with_retry( - replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC", - replica1.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC"), - ) - assert_eq_with_retry( - replica1, f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM source GROUP BY year ORDER BY year ASC", - replica2.query(f"SELECT toYear(date) AS year,round(avg(price)) AS price,bar(price, 0, 1000000, 80) FROM destination GROUP BY year ORDER BY year ASC"), + replica1, f"SELECT town from destination LIMIT 1", "SCARBOROUGH" ) assert_eq_with_retry( - replica1, f"SELECT town from destination LIMIT 1", - "SCARBOROUGH" - ) - - assert_eq_with_retry( - replica2, f"SELECT town from destination LIMIT 1", - "SCARBOROUGH" + replica2, f"SELECT town from destination LIMIT 1", "SCARBOROUGH" ) cleanup([replica1, replica2]) From 8de4a9dbfd32b7e82764a5c8efff3916b5c7ccda Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 20 Feb 2024 11:42:40 +0800 Subject: [PATCH 0037/1081] [fix] delete trailing whitespaces --- 
src/Storages/MergeTree/MergeTreeData.h | 2 +- src/Storages/StorageMergeTree.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 5f387385d38..081087acbaa 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -853,7 +853,7 @@ public: const IDataPartStorage::ClonePartParams & params, const ReadSettings & read_settings, const WriteSettings & write_settings); - + std::pair cloneAndLoadDataPartOnOtherDisk( const MergeTreeData::DataPartPtr & src_part, const String & tmp_part_prefix, diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index a2713775e65..47684925182 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -2081,7 +2081,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con /// This will generate unique name in scope of current server process. Int64 temp_index = insert_increment.get(); MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); - + IDataPartStorage::ClonePartParams clone_params{.txn = local_context->getCurrentTransaction()}; LOG_TRACE(log, "Partition exps are the same:part id: {}; number of disks:{}",dst_part_info.partition_id, this->getStoragePolicy()->getDisks().size()); bool on_same_disk = false; From 6437877a712bfaf4a36c180b332a0d6a37981af1 Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 20 Feb 2024 20:31:59 +0800 Subject: [PATCH 0038/1081] [fix] add changelog; change some feature logic --- CHANGELOG.md | 1 + src/Storages/MergeTree/MergeTreeData.cpp | 10 +++++++--- src/Storages/StorageMergeTree.cpp | 1 - src/Storages/StorageReplicatedMergeTree.cpp | 3 +-- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b3e5dd709ab..fd4ff90f841 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,6 +42,7 @@ * Added `FROM ` modifier for `SYSTEM SYNC REPLICA LIGHTWEIGHT` query. With the `FROM` modifier ensures we wait for fetches and drop-ranges only for the specified source replicas, as well as any replica not in zookeeper or with an empty source_replica. [#58393](https://github.com/ClickHouse/ClickHouse/pull/58393) ([Jayme Bird](https://github.com/jaymebrd)). * Added setting `update_insert_deduplication_token_in_dependent_materialized_views`. This setting allows to update insert deduplication token with table identifier during insert in dependent materialized views. Closes [#59165](https://github.com/ClickHouse/ClickHouse/issues/59165). [#59238](https://github.com/ClickHouse/ClickHouse/pull/59238) ([Maksim Kita](https://github.com/kitaisreal)). * Added statement `SYSTEM RELOAD ASYNCHRONOUS METRICS` which updates the asynchronous metrics. Mostly useful for testing and development. [#53710](https://github.com/ClickHouse/ClickHouse/pull/53710) ([Robert Schulze](https://github.com/rschu1ze)). +* Attach parts from a different disk `ALTER TABLE destination ATTACH PARTITION tuple() FROM source` where source is an [instant table](https://github.com/ClickHouse/web-tables-demo). [#60112](https://github.com/ClickHouse/ClickHouse/pull/60112)([Unalian](https://github.com/Unalian)). #### Performance Improvement * Coordination for parallel replicas is rewritten for better parallelism and cache locality. It has been tested for linear scalability on hundreds of replicas. It also got support for reading in order. 
[#57968](https://github.com/ClickHouse/ClickHouse/pull/57968) ([Nikita Taranov](https://github.com/nickitat)). diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 64787d3509b..be1346e0ea2 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -8,6 +8,7 @@ #include #include #include +#include "Common/logger_useful.h" #include #include #include @@ -7170,7 +7171,9 @@ std::pair MergeTreeData::cloneAn } } if (!copy_successful) - throw; + { + LOG_FATAL(&Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail"); + } } @@ -7301,8 +7304,9 @@ std::pair MergeTreeData::cloneAn } } if (!copy_successful) - throw; - + { + LOG_FATAL( &Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail."); + } if (params.metadata_version_to_write.has_value()) { chassert(!params.keep_metadata_version); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 47684925182..0748ac2dbdf 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -2104,7 +2104,6 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con } else { - clone_params.copy_instead_of_hardlink = true; auto [dst_part, part_lock] = cloneAndLoadDataPartOnOtherDisk( src_part, TMP_PREFIX, diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index df261053360..2460d2704c4 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -7933,7 +7933,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( for (const DiskPtr & disk : this->getStoragePolicy()->getDisks()) if (disk->getName() == src_part->getDataPartStorage().getDiskName()) on_same_disk = true; - if (on_same_disk) + if (on_same_disk && !clone_params.copy_instead_of_hardlink) { auto [dst_part, part_lock] = cloneAndLoadDataPartOnSameDisk( src_part, @@ -7948,7 +7948,6 @@ void StorageReplicatedMergeTree::replacePartitionFrom( } else { - clone_params.copy_instead_of_hardlink = true; auto [dst_part, part_lock] = cloneAndLoadDataPartOnOtherDisk( src_part, TMP_PREFIX, From fc3ebe007b3b5dc905ecbd63ed402547a1cde3a5 Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 20 Feb 2024 20:54:32 +0800 Subject: [PATCH 0039/1081] [fix] rm whitespaces --- src/Storages/MergeTree/MergeTreeData.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index be1346e0ea2..18bb0966bfc 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7171,9 +7171,7 @@ std::pair MergeTreeData::cloneAn } } if (!copy_successful) - { LOG_FATAL(&Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail"); - } } @@ -7304,9 +7302,7 @@ std::pair MergeTreeData::cloneAn } } if (!copy_successful) - { LOG_FATAL( &Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail."); - } if (params.metadata_version_to_write.has_value()) { chassert(!params.keep_metadata_version); From f829a97d9130de5609e07e237b9486847422bc8c Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 20 Feb 2024 21:08:24 +0800 Subject: [PATCH 0040/1081] [fix] rm whitespaces --- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 18bb0966bfc..849ceb1b66d 100644 --- 
a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7302,7 +7302,7 @@ std::pair MergeTreeData::cloneAn } } if (!copy_successful) - LOG_FATAL( &Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail."); + LOG_FATAL(&Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail."); if (params.metadata_version_to_write.has_value()) { chassert(!params.keep_metadata_version); From 28282eee91add78e5b18202bd38566d1d3797083 Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 20 Feb 2024 21:37:09 +0800 Subject: [PATCH 0041/1081] [fix] Add description in partition.md --- docs/en/sql-reference/statements/alter/partition.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/en/sql-reference/statements/alter/partition.md b/docs/en/sql-reference/statements/alter/partition.md index 114b8d5ffe3..277e174bb05 100644 --- a/docs/en/sql-reference/statements/alter/partition.md +++ b/docs/en/sql-reference/statements/alter/partition.md @@ -116,6 +116,8 @@ For the query to run successfully, the following conditions must be met: - Both tables must have the same indices and projections. - Both tables must have the same storage policy. +If both tables have the same storage policy, hardlinks are used to attach the partition. Otherwise, the data is copied. + ## REPLACE PARTITION ``` sql From 1731a5a8afba5a48ce01cea20e0cdc1f91316841 Mon Sep 17 00:00:00 2001 From: unashi Date: Wed, 21 Feb 2024 10:55:32 +0800 Subject: [PATCH 0042/1081] [improve] change the integration test test_multiple_disks::test_move_across_policies_not_work to test_move_across_policies_work_for_attach_not_work_for_move --- tests/integration/test_multiple_disks/test.py | 36 +++++++------------ 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/tests/integration/test_multiple_disks/test.py b/tests/integration/test_multiple_disks/test.py index fdd81284b2a..9584ace7f45 100644 --- a/tests/integration/test_multiple_disks/test.py +++ b/tests/integration/test_multiple_disks/test.py @@ -5,6 +5,7 @@ import string import threading import time from multiprocessing.dummy import Pool +from helpers.test_tools import assert_eq_with_retry import pytest from helpers.client import QueryRuntimeException @@ -1745,9 +1746,9 @@ def test_move_while_merge(start_cluster): node1.query(f"DROP TABLE IF EXISTS {name} SYNC") -def test_move_across_policies_does_not_work(start_cluster): +def test_move_across_policies_work_for_attach_not_work_for_move(start_cluster): try: - name = "test_move_across_policies_does_not_work" + name = "test_move_across_policies_work_for_attach_not_work_for_move" node1.query( """ @@ -1783,25 +1784,18 @@ def test_move_across_policies_does_not_work(start_cluster): except QueryRuntimeException: """All parts of partition 'all' are already on disk 'jbod2'.""" - with pytest.raises( - QueryRuntimeException, - match=".*because disk does not belong to storage policy.*", - ): - node1.query( - """ALTER TABLE {name}2 ATTACH PARTITION tuple() FROM {name}""".format( - name=name - ) + node1.query( + """ALTER TABLE {name}2 ATTACH PARTITION tuple() FROM {name}""".format( + name=name ) - - with pytest.raises( - QueryRuntimeException, - match=".*because disk does not belong to storage policy.*", - ): + ) + assert_eq_with_retry( + node1, + """SELECT * FROM {name}2""".format(name=name), node1.query( - """ALTER TABLE {name}2 REPLACE PARTITION tuple() FROM {name}""".format( - name=name - ) - ) + """SELECT * FROM {name}""".format(name=name), + ), + ) with pytest.raises( QueryRuntimeException, @@
-1813,10 +1807,6 @@ def test_move_across_policies_does_not_work(start_cluster): ) ) - assert node1.query( - """SELECT * FROM {name}""".format(name=name) - ).splitlines() == ["1"] - finally: node1.query(f"DROP TABLE IF EXISTS {name} SYNC") node1.query(f"DROP TABLE IF EXISTS {name}2 SYNC") From a34f42ca22c8a4820e4cbcf67cdd48a3589e3879 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sat, 13 Jan 2024 18:48:47 +0300 Subject: [PATCH 0043/1081] Remove lock from the ReadProgressCallback It looks redundant (added in 5ef51ed), though it has "fix tests" in the log message, but CI reports are not available for the commits from that PR [1], so let's try. [1]: https://github.com/ClickHouse/ClickHouse/pull/37543 Also this can be a big problem, since the code under that lock (throttling or quotas with previous implementation that uses boost::atomic_shared_ptr) may sleep. Some numbers:

run                     | time
------------------------|------
max_threads=100 before  | 23.1
max_threads=100 after   | 15.1
max_threads=4500 before | 4.5
max_threads=4500 after  | 2.3

Query: select sum(number) from numbers_mt(2000000) settings max_threads=X, max_block_size = 1
Signed-off-by: Azat Khuzhin --- src/QueryPipeline/ReadProgressCallback.cpp | 2 -- src/QueryPipeline/ReadProgressCallback.h | 1 - tests/performance/small_block_contention.xml | 3 +++ 3 files changed, 3 insertions(+), 3 deletions(-) create mode 100644 tests/performance/small_block_contention.xml diff --git a/src/QueryPipeline/ReadProgressCallback.cpp b/src/QueryPipeline/ReadProgressCallback.cpp index 59843d8791d..e90fc24d882 100644 --- a/src/QueryPipeline/ReadProgressCallback.cpp +++ b/src/QueryPipeline/ReadProgressCallback.cpp @@ -126,8 +126,6 @@ bool ReadProgressCallback::onProgress(uint64_t read_rows, uint64_t read_bytes, c CurrentThread::updatePerformanceCountersIfNeeded(); - std::lock_guard lock(limits_and_quotas_mutex); - /// TODO: Should be done in PipelineExecutor. for (const auto & limits : storage_limits) limits.local_limits.speed_limits.throttle(progress.read_rows, progress.read_bytes, total_rows, total_stopwatch.elapsedMicroseconds(), limits.local_limits.timeout_overflow_mode); diff --git a/src/QueryPipeline/ReadProgressCallback.h b/src/QueryPipeline/ReadProgressCallback.h index 5dbf3344bdf..7dfed9df5da 100644 --- a/src/QueryPipeline/ReadProgressCallback.h +++ b/src/QueryPipeline/ReadProgressCallback.h @@ -41,7 +41,6 @@ private: /// The total number of bytes to read. For progress bar.
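/// (Context for this hunk: everything onProgress() updates here is atomic,
/// while the throttling and quota code that used to run under the removed
/// limits_and_quotas_mutex may sleep, so the shared lock mainly serialized
/// reader threads; the timings in the commit message above quantify that.)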
std::atomic_size_t total_bytes = 0; - std::mutex limits_and_quotas_mutex; Stopwatch total_stopwatch{CLOCK_MONOTONIC_COARSE}; /// Including waiting time bool update_profile_events = true; diff --git a/tests/performance/small_block_contention.xml b/tests/performance/small_block_contention.xml new file mode 100644 index 00000000000..ce1995a0a29 --- /dev/null +++ b/tests/performance/small_block_contention.xml @@ -0,0 +1,3 @@ +<test> + <query>select sum(number) from numbers_mt(200000) settings max_threads=100, max_block_size = 1 format Null</query> +</test> From 9cb1ade3e2967507885f5b0e3deefab2ad40082c Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 22 Feb 2024 17:07:47 +0000 Subject: [PATCH 0044/1081] fix db iterator wait --- src/Common/AsyncLoader.cpp | 2 +- src/Databases/DatabaseOrdinary.cpp | 20 ++++++++++++++++---- src/Databases/IDatabase.h | 17 +----------------- src/Interpreters/InterpreterDropQuery.cpp | 2 +- 4 files changed, 19 insertions(+), 22 deletions(-) diff --git a/src/Common/AsyncLoader.cpp b/src/Common/AsyncLoader.cpp index 140194e10b4..4a39454ccbb 100644 --- a/src/Common/AsyncLoader.cpp +++ b/src/Common/AsyncLoader.cpp @@ -39,7 +39,7 @@ void logAboutProgress(LoggerPtr log, size_t processed, size_t total, AtomicStopw { if (total && (processed % PRINT_MESSAGE_EACH_N_OBJECTS == 0 || watch.compareAndRestart(PRINT_MESSAGE_EACH_N_SECONDS))) { - LOG_INFO(log, "Processed: {}%", static_cast(processed * 1000.0 / total) * 0.1); + LOG_INFO(log, "Processed: {:.1f}%", static_cast<double>(processed) * 100.0 / total); watch.restart(); } } diff --git a/src/Databases/DatabaseOrdinary.cpp b/src/Databases/DatabaseOrdinary.cpp index 644bed23350..40e0fb0a0ed 100644 --- a/src/Databases/DatabaseOrdinary.cpp +++ b/src/Databases/DatabaseOrdinary.cpp @@ -289,10 +289,22 @@ void DatabaseOrdinary::stopLoading() DatabaseTablesIteratorPtr DatabaseOrdinary::getTablesIterator(ContextPtr local_context, const DatabaseOnDisk::FilterByNameFunction & filter_by_table_name) const { - auto result = DatabaseWithOwnTablesBase::getTablesIterator(local_context, filter_by_table_name); - std::scoped_lock lock(mutex); - typeid_cast<DatabaseTablesSnapshotIterator &>(*result).setLoadTasks(startup_table); - return result; + // Wait for every table (matching the filter) to be loaded and started up before we make the snapshot. + // It is important, because otherwise the table might be: + // - not attached and thus will be missed in the snapshot; + // - not started, which is not good for DDL operations.
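// (Shape of the fix below: the matching startup tasks are collected while
// `mutex` is held, but waitLoad() runs only after the lock scope closes,
// since waiting can block for a long time and must not happen under the
// database mutex.)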
+ LoadTaskPtrs tasks_to_wait; + { + std::lock_guard lock(mutex); + if (!filter_by_table_name) + tasks_to_wait.reserve(startup_table.size()); + for (const auto & [table_name, task] : startup_table) + if (!filter_by_table_name || filter_by_table_name(table_name)) + tasks_to_wait.emplace_back(task); + } + waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), tasks_to_wait); + + return DatabaseWithOwnTablesBase::getTablesIterator(local_context, filter_by_table_name); } void DatabaseOrdinary::alterTable(ContextPtr local_context, const StorageID & table_id, const StorageInMemoryMetadata & metadata) diff --git a/src/Databases/IDatabase.h b/src/Databases/IDatabase.h index ec380fa759d..75662bfebe3 100644 --- a/src/Databases/IDatabase.h +++ b/src/Databases/IDatabase.h @@ -77,17 +77,12 @@ private: Tables tables; Tables::iterator it; - // Tasks to wait before returning a table - using Tasks = std::unordered_map; - Tasks tasks; - protected: DatabaseTablesSnapshotIterator(DatabaseTablesSnapshotIterator && other) noexcept : IDatabaseTablesIterator(std::move(other.database_name)) { size_t idx = std::distance(other.tables.begin(), other.it); std::swap(tables, other.tables); - std::swap(tasks, other.tasks); other.it = other.tables.end(); it = tables.begin(); std::advance(it, idx); @@ -110,17 +105,7 @@ public: const String & name() const override { return it->first; } - const StoragePtr & table() const override - { - if (auto task = tasks.find(it->first); task != tasks.end()) - waitLoad(currentPoolOr(TablesLoaderForegroundPoolId), task->second); - return it->second; - } - - void setLoadTasks(const Tasks & tasks_) - { - tasks = tasks_; - } + const StoragePtr & table() const override { return it->second; } }; using DatabaseTablesIteratorPtr = std::unique_ptr; diff --git a/src/Interpreters/InterpreterDropQuery.cpp b/src/Interpreters/InterpreterDropQuery.cpp index 711100b5de1..72aa4cc63e3 100644 --- a/src/Interpreters/InterpreterDropQuery.cpp +++ b/src/Interpreters/InterpreterDropQuery.cpp @@ -417,7 +417,7 @@ BlockIO InterpreterDropQuery::executeToDatabaseImpl(const ASTDropQuery & query, uuids_to_wait.push_back(table_to_wait); } } - // only if operation is DETACH + // only if operation is DETACH if ((!drop || !truncate) && query.sync) { /// Avoid "some tables are still in use" when sync mode is enabled From 835b47519a7c575d70542e5a37c97dbf5a2b25f9 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 14 Feb 2024 00:44:38 +0100 Subject: [PATCH 0045/1081] impl --- src/Backups/BackupIO_S3.cpp | 1 + src/Coordination/KeeperSnapshotManagerS3.cpp | 1 + src/Disks/ObjectStorages/S3/diskSettings.cpp | 1 + src/IO/S3/Client.cpp | 19 +++++++++--- src/IO/S3/Client.h | 6 ++++ src/IO/S3/Requests.h | 32 ++++++++++++++++++-- src/IO/S3/URI.cpp | 18 +++++------ src/IO/S3/tests/gtest_aws_s3_client.cpp | 25 ++++++++++++++- src/IO/WriteBufferFromS3.cpp | 15 ++++++++- src/IO/WriteBufferFromS3.h | 1 + src/IO/tests/gtest_s3_uri.cpp | 8 +++++ src/IO/tests/gtest_writebuffer_s3.cpp | 21 +++++++------ src/Storages/StorageS3.cpp | 1 + 13 files changed, 119 insertions(+), 30 deletions(-) diff --git a/src/Backups/BackupIO_S3.cpp b/src/Backups/BackupIO_S3.cpp index 9359602a651..2063af2061c 100644 --- a/src/Backups/BackupIO_S3.cpp +++ b/src/Backups/BackupIO_S3.cpp @@ -73,6 +73,7 @@ namespace .use_virtual_addressing = s3_uri.is_virtual_hosted_style, .disable_checksum = local_settings.s3_disable_checksum, .gcs_issue_compose_request = context->getConfigRef().getBool("s3.gcs_issue_compose_request", false), + .is_s3express_bucket = 
S3::isS3ExpressEndpoint(s3_uri.endpoint), }; return S3::ClientFactory::instance().create( diff --git a/src/Coordination/KeeperSnapshotManagerS3.cpp b/src/Coordination/KeeperSnapshotManagerS3.cpp index 0337a564660..9779a041095 100644 --- a/src/Coordination/KeeperSnapshotManagerS3.cpp +++ b/src/Coordination/KeeperSnapshotManagerS3.cpp @@ -103,6 +103,7 @@ void KeeperSnapshotManagerS3::updateS3Configuration(const Poco::Util::AbstractCo .use_virtual_addressing = new_uri.is_virtual_hosted_style, .disable_checksum = false, .gcs_issue_compose_request = false, + .is_s3express_bucket = S3::isS3ExpressEndpoint(new_uri.endpoint), }; auto client = S3::ClientFactory::instance().create( diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index 4fd4b17aabe..b8688cd3de6 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -97,6 +97,7 @@ std::unique_ptr getClient( .use_virtual_addressing = uri.is_virtual_hosted_style, .disable_checksum = local_settings.s3_disable_checksum, .gcs_issue_compose_request = config.getBool("s3.gcs_issue_compose_request", false), + .is_s3express_bucket = S3::isS3ExpressEndpoint(endpoint), }; return S3::ClientFactory::instance().create( diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index 1b6b245b89a..a75d41df3d1 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -304,6 +304,9 @@ Model::HeadObjectOutcome Client::HeadObject(HeadObjectRequest & request) const request.setApiMode(api_mode); + if (isS3ExpressBucket()) + request.setIsS3ExpressBucket(); + addAdditionalAMZHeadersToCanonicalHeadersList(request, client_configuration.extra_headers); if (auto region = getRegionForBucket(bucket); !region.empty()) @@ -530,7 +533,11 @@ Client::doRequest(RequestType & request, RequestFn request_fn) const addAdditionalAMZHeadersToCanonicalHeadersList(request, client_configuration.extra_headers); const auto & bucket = request.GetBucket(); request.setApiMode(api_mode); - if (client_settings.disable_checksum) + + /// We have to use checksums for S3Express buckets, so the order of checks should be the following + if (client_settings.is_s3express_bucket) + request.setIsS3ExpressBucket(); + else if (client_settings.disable_checksum) request.disableChecksum(); if (auto region = getRegionForBucket(bucket); !region.empty()) @@ -915,9 +922,9 @@ std::unique_ptr ClientFactory::create( // NOLINT std::move(sse_kms_config), credentials_provider, client_configuration, // Client configuration. - Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, - client_settings - ); + client_settings.is_s3express_bucket ? 
Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::RequestDependent + : Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + client_settings); } PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT @@ -956,6 +963,10 @@ PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT return config; } +bool isS3ExpressEndpoint(const std::string & endpoint) +{ + return endpoint.contains("s3express"); +} } } diff --git a/src/IO/S3/Client.h b/src/IO/S3/Client.h index 8da21bd2c2c..c7bc727bf32 100644 --- a/src/IO/S3/Client.h +++ b/src/IO/S3/Client.h @@ -92,6 +92,8 @@ private: std::unordered_map> client_caches; }; +bool isS3ExpressEndpoint(const std::string & endpoint); + struct ClientSettings { bool use_virtual_addressing; @@ -107,6 +109,7 @@ struct ClientSettings /// Ability to enable it preserved since likely it is required for old /// files. bool gcs_issue_compose_request; + bool is_s3express_bucket; }; /// Client that improves the client from the AWS SDK @@ -208,6 +211,9 @@ public: const std::shared_ptr& httpRequest) const override; bool supportsMultiPartCopy() const; + + bool isS3ExpressBucket() const { return client_settings.is_s3express_bucket; } + private: friend struct ::MockS3::Client; diff --git a/src/IO/S3/Requests.h b/src/IO/S3/Requests.h index bfb94a5a67e..6f82a0f39d3 100644 --- a/src/IO/S3/Requests.h +++ b/src/IO/S3/Requests.h @@ -21,12 +21,32 @@ #include #include #include +#include +#include + +#include namespace DB::S3 { namespace Model = Aws::S3::Model; +/// Used only for S3Express +namespace RequestChecksum +{ +inline void setPartChecksum(Model::CompletedPart & part, const std::string & checksum) +{ + part.SetChecksumCRC32(checksum); +} + +template +inline void setChecksumAlgorithm(R & request) +{ + if constexpr (requires { request.SetChecksumAlgorithm(Model::ChecksumAlgorithm::CRC32); }) + request.SetChecksumAlgorithm(Model::ChecksumAlgorithm::CRC32); +} +}; + template class ExtendedRequest : public BaseRequest { @@ -49,11 +69,13 @@ public: Aws::String GetChecksumAlgorithmName() const override { + chassert(!is_s3express_bucket || checksum); + /// Return empty string is enough to disable checksums (see /// AWSClient::AddChecksumToRequest [1] for more details). /// /// [1]: https://github.com/aws/aws-sdk-cpp/blob/b0ee1c0d336dbb371c34358b68fba6c56aae2c92/src/aws-cpp-sdk-core/source/client/AWSClient.cpp#L783-L839 - if (!checksum) + if (!is_s3express_bucket && !checksum) return ""; return BaseRequest::GetChecksumAlgorithmName(); } @@ -84,9 +106,12 @@ public: } /// Disable checksum to avoid extra read of the input stream - void disableChecksum() const + void disableChecksum() const { checksum = false; } + + void setIsS3ExpressBucket() { - checksum = false; + is_s3express_bucket = true; + RequestChecksum::setChecksumAlgorithm(*this); } protected: @@ -94,6 +119,7 @@ protected: mutable std::optional uri_override; mutable ApiMode api_mode{ApiMode::AWS}; mutable bool checksum = true; + bool is_s3express_bucket = false; }; class CopyObjectRequest : public ExtendedRequest diff --git a/src/IO/S3/URI.cpp b/src/IO/S3/URI.cpp index 23f59420bfe..062d3b80850 100644 --- a/src/IO/S3/URI.cpp +++ b/src/IO/S3/URI.cpp @@ -35,7 +35,7 @@ URI::URI(const std::string & uri_) /// Case when bucket name represented in domain name of S3 URL. /// E.g. 
(https://bucket-name.s3.Region.amazonaws.com/key) /// https://docs.aws.amazon.com/AmazonS3/latest/dev/VirtualHosting.html#virtual-hosted-style-access - static const RE2 virtual_hosted_style_pattern(R"((.+)\.(s3|cos|obs|oss|eos)([.\-][a-z0-9\-.:]+))"); + static const RE2 virtual_hosted_style_pattern(R"((.+)\.(s3express[\-a-z0-9]+|s3|cos|obs|oss|eos)([.\-][a-z0-9\-.:]+))"); /// Case when bucket name and key represented in path of S3 URL. /// E.g. (https://s3.Region.amazonaws.com/bucket-name/key) @@ -43,6 +43,7 @@ URI::URI(const std::string & uri_) static const RE2 path_style_pattern("^/([^/]*)/(.*)"); static constexpr auto S3 = "S3"; + static constexpr auto S3EXPRESS = "S3EXPRESS"; static constexpr auto COSN = "COSN"; static constexpr auto COS = "COS"; static constexpr auto OBS = "OBS"; @@ -115,21 +116,16 @@ URI::URI(const std::string & uri_) } boost::to_upper(name); - if (name != S3 && name != COS && name != OBS && name != OSS && name != EOS) + /// For S3Express it will look like s3express-eun1-az1, i.e. contain region and AZ info + if (name != S3 && !name.starts_with(S3EXPRESS) && name != COS && name != OBS && name != OSS && name != EOS) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name)); - if (name == S3) - storage_name = name; - else if (name == OBS) - storage_name = OBS; - else if (name == OSS) - storage_name = OSS; - else if (name == EOS) - storage_name = EOS; - else + if (name == COS || name == COSN) storage_name = COSN; + else + storage_name = name; } else if (re2::RE2::PartialMatch(uri.getPath(), path_style_pattern, &bucket, &key)) { diff --git a/src/IO/S3/tests/gtest_aws_s3_client.cpp b/src/IO/S3/tests/gtest_aws_s3_client.cpp index 33917314bca..8edbe12a22f 100644 --- a/src/IO/S3/tests/gtest_aws_s3_client.cpp +++ b/src/IO/S3/tests/gtest_aws_s3_client.cpp @@ -110,7 +110,8 @@ void testServerSideEncryption( bool disable_checksum, String server_side_encryption_customer_key_base64, DB::S3::ServerSideEncryptionKMSConfig sse_kms_config, - String expected_headers) + String expected_headers, + bool is_s3express_bucket = false) { TestPocoHTTPServer http; @@ -144,6 +145,7 @@ void testServerSideEncryption( .use_virtual_addressing = uri.is_virtual_hosted_style, .disable_checksum = disable_checksum, .gcs_issue_compose_request = false, + .is_s3express_bucket = is_s3express_bucket, }; std::shared_ptr client = DB::S3::ClientFactory::instance().create( @@ -295,4 +297,25 @@ TEST(IOTestAwsS3Client, AppendExtraSSEKMSHeadersWrite) "x-amz-server-side-encryption-context: arn:aws:s3:::bucket_ARN\n"); } +TEST(IOTestAwsS3Client, ChecksumHeaderIsPresentForS3Express) +{ + /// See https://github.com/ClickHouse/ClickHouse/pull/19748 + testServerSideEncryption( + doWriteRequest, + /* disable_checksum= */ true, + "", + {}, + "authorization: ... 
SignedHeaders=" + "amz-sdk-invocation-id;" + "amz-sdk-request;" + "content-length;" + "content-type;" + "host;" + "x-amz-checksum-crc32;" + "x-amz-content-sha256;" + "x-amz-date;" + "x-amz-sdk-checksum-algorithm, ...\n", + /*is_s3express_bucket=*/true); +} + #endif diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 5bb01050591..a162992278f 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -18,7 +18,9 @@ #include #include +#include #include +#include #include @@ -456,6 +458,14 @@ S3::UploadPartRequest WriteBufferFromS3::getUploadRequest(size_t part_number, Pa /// If we don't do it, AWS SDK can mistakenly set it to application/xml, see https://github.com/aws/aws-sdk-cpp/issues/1840 req.SetContentType("binary/octet-stream"); + /// Checksums need to be provided on CompleteMultipartUpload requests, so we calculate then manually and store in multipart_checksums + if (client_ptr->isS3ExpressBucket()) + { + chassert(req.GetChecksumAlgorithm() == Aws::S3::Model::ChecksumAlgorithm::CRC32); + req.SetChecksumCRC32(Aws::Utils::HashingUtils::Base64Encode(Aws::Utils::HashingUtils::CalculateCRC32(*(req.GetBody())))); + multipart_checksums.push_back(req.GetChecksumCRC32()); + } + return req; } @@ -575,7 +585,10 @@ void WriteBufferFromS3::completeMultipartUpload() for (size_t i = 0; i < multipart_tags.size(); ++i) { Aws::S3::Model::CompletedPart part; - multipart_upload.AddParts(part.WithETag(multipart_tags[i]).WithPartNumber(static_cast(i + 1))); + part.WithETag(multipart_tags[i]).WithPartNumber(static_cast(i + 1)); + if (!multipart_checksums.empty()) + S3::RequestChecksum::setPartChecksum(part, multipart_checksums.at(i)); + multipart_upload.AddParts(part); } req.SetMultipartUpload(multipart_upload); diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 5dc269990a1..148cd27f854 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -100,6 +100,7 @@ private: /// We initiate upload, then upload each part and get ETag as a response, and then finalizeImpl() upload with listing all our parts. 
String multipart_upload_id; std::deque multipart_tags; + std::deque multipart_checksums; bool multipart_upload_finished = false; /// Track that prefinalize() is called only once diff --git a/src/IO/tests/gtest_s3_uri.cpp b/src/IO/tests/gtest_s3_uri.cpp index c088e41f1e8..5bf0dfb962d 100644 --- a/src/IO/tests/gtest_s3_uri.cpp +++ b/src/IO/tests/gtest_s3_uri.cpp @@ -162,6 +162,14 @@ TEST(S3UriTest, validPatterns) ASSERT_EQ("", uri.version_id); ASSERT_EQ(false, uri.is_virtual_hosted_style); } + { + S3::URI uri("https://test-perf-bucket--eun1-az1--x-s3.s3express-eun1-az1.eu-north-1.amazonaws.com/test.csv"); + ASSERT_EQ("https://s3express-eun1-az1.eu-north-1.amazonaws.com", uri.endpoint); + ASSERT_EQ("test-perf-bucket--eun1-az1--x-s3", uri.bucket); + ASSERT_EQ("test.csv", uri.key); + ASSERT_EQ("", uri.version_id); + ASSERT_EQ(true, uri.is_virtual_hosted_style); + } } TEST_P(S3UriTest, invalidPatterns) diff --git a/src/IO/tests/gtest_writebuffer_s3.cpp b/src/IO/tests/gtest_writebuffer_s3.cpp index ae00bb2e9e2..d9cb486c09e 100644 --- a/src/IO/tests/gtest_writebuffer_s3.cpp +++ b/src/IO/tests/gtest_writebuffer_s3.cpp @@ -205,16 +205,17 @@ struct Client : DB::S3::Client { explicit Client(std::shared_ptr mock_s3_store) : DB::S3::Client( - 100, - DB::S3::ServerSideEncryptionKMSConfig(), - std::make_shared("", ""), - GetClientConfiguration(), - Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, - DB::S3::ClientSettings{ - .use_virtual_addressing = true, - .disable_checksum= false, - .gcs_issue_compose_request = false, - }) + 100, + DB::S3::ServerSideEncryptionKMSConfig(), + std::make_shared("", ""), + GetClientConfiguration(), + Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never, + DB::S3::ClientSettings{ + .use_virtual_addressing = true, + .disable_checksum = false, + .gcs_issue_compose_request = false, + .is_s3express_bucket = false, + }) , store(mock_s3_store) {} diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 2d8ef3df1c8..044a1ca5362 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1427,6 +1427,7 @@ void StorageS3::Configuration::connect(const ContextPtr & context) .use_virtual_addressing = url.is_virtual_hosted_style, .disable_checksum = local_settings.s3_disable_checksum, .gcs_issue_compose_request = context->getConfigRef().getBool("s3.gcs_issue_compose_request", false), + .is_s3express_bucket = S3::isS3ExpressEndpoint(url.endpoint), }; auto credentials = Aws::Auth::AWSCredentials(auth_settings.access_key_id, auth_settings.secret_access_key, auth_settings.session_token); From 277e8d965555b4fcd09a755282666bcae36adae6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 26 Feb 2024 14:03:53 +0800 Subject: [PATCH 0046/1081] Fix usage plain metadata type with new configuration option --- src/Disks/DiskType.cpp | 48 +++++++++++++++++++ src/Disks/DiskType.h | 34 +------------ src/Disks/ObjectStorages/IObjectStorage.h | 1 + .../ObjectStorages/MetadataStorageFactory.cpp | 36 +++++++++++--- .../ObjectStorages/MetadataStorageFactory.h | 7 +++ .../ObjectStorages/ObjectStorageFactory.cpp | 43 +++++++++++++---- src/Disks/ObjectStorages/PlainObjectStorage.h | 29 +++++++++++ .../RegisterDiskObjectStorage.cpp | 24 ++-------- src/Disks/ObjectStorages/S3/S3ObjectStorage.h | 21 -------- .../configs/disk_s3.xml | 7 +++ .../test_attach_backup_from_s3_plain/test.py | 25 ++++++---- 11 files changed, 178 insertions(+), 97 deletions(-) create mode 100644 src/Disks/ObjectStorages/PlainObjectStorage.h diff --git a/src/Disks/DiskType.cpp 
b/src/Disks/DiskType.cpp index 218b6ee7f26..1778ae8025b 100644 --- a/src/Disks/DiskType.cpp +++ b/src/Disks/DiskType.cpp @@ -1,7 +1,27 @@ #include "DiskType.h" +#include +#include namespace DB { +namespace ErrorCodes +{ + extern const int UNKNOWN_ELEMENT_IN_CONFIG; +} + +MetadataStorageType metadataTypeFromString(const String & type) +{ + auto check_type = Poco::toLower(type); + if (check_type == "local") + return MetadataStorageType::Local; + if (check_type == "plain") + return MetadataStorageType::Plain; + if (check_type == "web") + return MetadataStorageType::StaticWeb; + + throw Exception(ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG, + "MetadataStorageFactory: unknown metadata storage type: {}", type); +} bool DataSourceDescription::operator==(const DataSourceDescription & other) const { @@ -14,4 +34,32 @@ bool DataSourceDescription::sameKind(const DataSourceDescription & other) const == std::tie(other.type, other.object_storage_type, other.description); } +std::string DataSourceDescription::toString() const +{ + switch (type) + { + case DataSourceType::Local: + return "local"; + case DataSourceType::RAM: + return "memory"; + case DataSourceType::ObjectStorage: + { + switch (object_storage_type) + { + case ObjectStorageType::S3: + return "s3"; + case ObjectStorageType::HDFS: + return "hdfs"; + case ObjectStorageType::Azure: + return "azure_blob_storage"; + case ObjectStorageType::Local: + return "local_blob_storage"; + case ObjectStorageType::Web: + return "web"; + case ObjectStorageType::None: + return "none"; + } + } + } +} } diff --git a/src/Disks/DiskType.h b/src/Disks/DiskType.h index 15940ea9155..36fe4d83004 100644 --- a/src/Disks/DiskType.h +++ b/src/Disks/DiskType.h @@ -17,7 +17,6 @@ enum class ObjectStorageType { None, S3, - S3_Plain, Azure, HDFS, Web, @@ -30,9 +29,9 @@ enum class MetadataStorageType Local, Plain, StaticWeb, - Memory, }; +MetadataStorageType metadataTypeFromString(const String & type); String toString(DataSourceType data_source_type); struct DataSourceDescription @@ -49,36 +48,7 @@ struct DataSourceDescription bool operator==(const DataSourceDescription & other) const; bool sameKind(const DataSourceDescription & other) const; - std::string toString() const - { - switch (type) - { - case DataSourceType::Local: - return "local"; - case DataSourceType::RAM: - return "memory"; - case DataSourceType::ObjectStorage: - { - switch (object_storage_type) - { - case ObjectStorageType::S3: - return "s3"; - case ObjectStorageType::S3_Plain: - return "s3_plain"; - case ObjectStorageType::HDFS: - return "hdfs"; - case ObjectStorageType::Azure: - return "azure_blob_storage"; - case ObjectStorageType::Local: - return "local_blob_storage"; - case ObjectStorageType::Web: - return "web"; - case ObjectStorageType::None: - return "none"; - } - } - } - } + std::string toString() const; }; } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index 56c269a3fc5..fde97d82ad1 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -218,6 +218,7 @@ public: virtual bool isReadOnly() const { return false; } virtual bool isWriteOnce() const { return false; } + virtual bool isPlain() const { return false; } virtual bool supportParallelWrite() const { return false; } diff --git a/src/Disks/ObjectStorages/MetadataStorageFactory.cpp b/src/Disks/ObjectStorages/MetadataStorageFactory.cpp index 52a0b9ec268..adc1f84372c 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFactory.cpp +++ 
b/src/Disks/ObjectStorages/MetadataStorageFactory.cpp @@ -32,6 +32,35 @@ void MetadataStorageFactory::registerMetadataStorageType(const std::string & met } } +std::string MetadataStorageFactory::getCompatibilityMetadataTypeHint(const ObjectStorageType & type) +{ + switch (type) + { + case ObjectStorageType::S3: + case ObjectStorageType::HDFS: + case ObjectStorageType::Local: + case ObjectStorageType::Azure: + return "local"; + case ObjectStorageType::Web: + return "web"; + default: + return ""; + } +} + +std::string MetadataStorageFactory::getMetadataType( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const std::string & compatibility_type_hint) +{ + if (compatibility_type_hint.empty() && !config.has(config_prefix + ".metadata_type")) + { + throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Expected `metadata_type` in config"); + } + + return config.getString(config_prefix + ".metadata_type", compatibility_type_hint); +} + MetadataStoragePtr MetadataStorageFactory::create( const std::string & name, const Poco::Util::AbstractConfiguration & config, @@ -39,12 +68,7 @@ MetadataStoragePtr MetadataStorageFactory::create( ObjectStoragePtr object_storage, const std::string & compatibility_type_hint) const { - if (compatibility_type_hint.empty() && !config.has(config_prefix + ".metadata_type")) - { - throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Expected `metadata_type` in config"); - } - - const auto type = config.getString(config_prefix + ".metadata_type", compatibility_type_hint); + const auto type = getMetadataType(config, config_prefix, compatibility_type_hint); const auto it = registry.find(type); if (it == registry.end()) diff --git a/src/Disks/ObjectStorages/MetadataStorageFactory.h b/src/Disks/ObjectStorages/MetadataStorageFactory.h index 5f61125c599..467cd3cef98 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFactory.h +++ b/src/Disks/ObjectStorages/MetadataStorageFactory.h @@ -25,6 +25,13 @@ public: ObjectStoragePtr object_storage, const std::string & compatibility_type_hint) const; + static std::string getMetadataType( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + const std::string & compatibility_type_hint = ""); + + static std::string getCompatibilityMetadataTypeHint(const ObjectStorageType & type); + private: using Registry = std::unordered_map; Registry registry; diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index b3626135177..6f6ff199902 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -16,8 +16,10 @@ #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD #include #include +#include #include #endif +#include #include #include @@ -32,6 +34,28 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +namespace +{ + template + ObjectStoragePtr createObjectStorage( + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix, + Args && ...args) + { + auto compatibility_hint = MetadataStorageFactory::getCompatibilityMetadataTypeHint(ObjectStorageType::S3); + auto metadata_type = MetadataStorageFactory::getMetadataType(config, config_prefix, compatibility_hint); + + if (metadataTypeFromString(metadata_type) == MetadataStorageType::Plain) + { + return std::make_shared>(std::forward(args)...); + } + else + { + return std::make_shared(std::forward(args)...); + } + } +} + ObjectStorageFactory & ObjectStorageFactory::instance() { static 
ObjectStorageFactory factory; @@ -129,12 +153,12 @@ void registerS3ObjectStorage(ObjectStorageFactory & factory) auto client = getClient(config, config_prefix, context, *settings); auto key_generator = getKeyGenerator(disk_type, uri, config, config_prefix); - auto object_storage = std::make_shared( - std::move(client), std::move(settings), uri, s3_capabilities, key_generator, name); + auto object_storage = createObjectStorage( + config, config_prefix, std::move(client), std::move(settings), uri, s3_capabilities, key_generator, name); /// NOTE: should we still perform this check for clickhouse-disks? if (!skip_access_check) - checkS3Capabilities(*object_storage, s3_capabilities, name, uri.key); + checkS3Capabilities(*dynamic_cast(object_storage.get()), s3_capabilities, name, uri.key); return object_storage; }); @@ -165,12 +189,12 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) auto client = getClient(config, config_prefix, context, *settings); auto key_generator = getKeyGenerator(disk_type, uri, config, config_prefix); - auto object_storage = std::make_shared( + auto object_storage = std::make_shared>( std::move(client), std::move(settings), uri, s3_capabilities, key_generator, name); /// NOTE: should we still perform this check for clickhouse-disks? if (!skip_access_check) - checkS3Capabilities(*object_storage, s3_capabilities, name, uri.key); + checkS3Capabilities(*dynamic_cast(object_storage.get()), s3_capabilities, name, uri.key); return object_storage; }); @@ -198,7 +222,7 @@ void registerHDFSObjectStorage(ObjectStorageFactory & factory) context->getSettingsRef().hdfs_replication ); - return std::make_unique(uri, std::move(settings), config); + return createObjectStorage(config, config_prefix, uri, std::move(settings), config); }); } #endif @@ -214,7 +238,8 @@ void registerAzureObjectStorage(ObjectStorageFactory & factory) bool /* skip_access_check */) -> ObjectStoragePtr { String container_name = config.getString(config_prefix + ".container_name", "default-container"); - return std::make_unique( + return createObjectStorage( + config, config_prefix, name, getAzureBlobContainerClient(config, config_prefix), getAzureBlobStorageSettings(config, config_prefix, context), @@ -248,7 +273,7 @@ void registerWebObjectStorage(ObjectStorageFactory & factory) ErrorCodes::BAD_ARGUMENTS, "Bad URI: `{}`. Error: {}", uri, e.what()); } - return std::make_shared(uri, context); + return createObjectStorage(config, config_prefix, uri, context); }); } @@ -266,7 +291,7 @@ void registerLocalObjectStorage(ObjectStorageFactory & factory) loadDiskLocalConfig(name, config, config_prefix, context, object_key_prefix, keep_free_space_bytes); /// keys are mapped to the fs, object_key_prefix is a directory also fs::create_directories(object_key_prefix); - return std::make_shared(object_key_prefix); + return createObjectStorage(config, config_prefix, object_key_prefix); }); } #endif diff --git a/src/Disks/ObjectStorages/PlainObjectStorage.h b/src/Disks/ObjectStorages/PlainObjectStorage.h new file mode 100644 index 00000000000..3a81b85c44b --- /dev/null +++ b/src/Disks/ObjectStorages/PlainObjectStorage.h @@ -0,0 +1,29 @@ +#pragma once +#include + +namespace DB +{ + +/// Do not encode keys, store as-is, and do not require separate disk for metadata. +/// But because of this does not support renames/hardlinks/attrs/... +/// +/// NOTE: This disk has excessive API calls. 
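/// (Implementation note: the class below is a decorator over an arbitrary
/// concrete storage such as S3ObjectStorage or LocalObjectStorage. Object
/// keys are the logical paths stored as-is, which is exactly why renames,
/// hardlinks and attributes cannot be supported.)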
+template +class PlainObjectStorage : public BaseObjectStorage +{ +public: + template + explicit PlainObjectStorage(Args && ...args) + : BaseObjectStorage(std::forward(args)...) {} + + std::string getName() const override { return "" + BaseObjectStorage::getName(); } + + /// Notes: + /// - supports BACKUP to this disk + /// - does not support INSERT into MergeTree table on this disk + bool isWriteOnce() const override { return true; } + + bool isPlain() const override { return true; } +}; + +} diff --git a/src/Disks/ObjectStorages/RegisterDiskObjectStorage.cpp b/src/Disks/ObjectStorages/RegisterDiskObjectStorage.cpp index 383a0b079b5..669a0102951 100644 --- a/src/Disks/ObjectStorages/RegisterDiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/RegisterDiskObjectStorage.cpp @@ -10,25 +10,6 @@ namespace DB void registerObjectStorages(); void registerMetadataStorages(); -static std::string getCompatibilityMetadataTypeHint(const ObjectStorageType & type) -{ - switch (type) - { - case ObjectStorageType::S3: - case ObjectStorageType::HDFS: - case ObjectStorageType::Local: - case ObjectStorageType::Azure: - return "local"; - case ObjectStorageType::S3_Plain: - return "plain"; - case ObjectStorageType::Web: - return "web"; - case ObjectStorageType::None: - return ""; - } - UNREACHABLE(); -} - void registerDiskObjectStorage(DiskFactory & factory, bool global_skip_access_check) { registerObjectStorages(); @@ -47,7 +28,10 @@ void registerDiskObjectStorage(DiskFactory & factory, bool global_skip_access_ch std::string compatibility_metadata_type_hint; if (!config.has(config_prefix + ".metadata_type")) { - compatibility_metadata_type_hint = getCompatibilityMetadataTypeHint(object_storage->getType()); + if (object_storage->isPlain()) + compatibility_metadata_type_hint = "plain"; + else + compatibility_metadata_type_hint = MetadataStorageFactory::getCompatibilityMetadataTypeHint(object_storage->getType()); } auto metadata_storage = MetadataStorageFactory::instance().create( diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h index ab0fa5bed68..4ece98c5ec4 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.h +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.h @@ -182,27 +182,6 @@ private: LoggerPtr log; }; -/// Do not encode keys, store as-is, and do not require separate disk for metadata. -/// But because of this does not support renames/hardlinks/attrs/... -/// -/// NOTE: This disk has excessive API calls. -class S3PlainObjectStorage : public S3ObjectStorage -{ -public: - std::string getName() const override { return "S3PlainObjectStorage"; } - - template - explicit S3PlainObjectStorage(Args && ...args) - : S3ObjectStorage("S3PlainObjectStorage", std::forward(args)...) {} - - ObjectStorageType getType() const override { return ObjectStorageType::S3_Plain; } - - /// Notes: - /// - supports BACKUP to this disk - /// - does not support INSERT into MergeTree table on this disk - bool isWriteOnce() const override { return true; } -}; - } #endif diff --git a/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml b/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml index 779e4b6ae21..3166eea7ccb 100644 --- a/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml +++ b/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml @@ -8,9 +8,16 @@ minio minio123 + + object_storage + local + plain + local_plain/ +
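<!-- (The local_plain disk defined above exercises the new-style configuration
     from this patch: type = object_storage plus an explicit object_storage_type
     and metadata_type = plain, instead of a dedicated plain disk type.) -->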
backup_disk_s3_plain + backup_disk_local_plain diff --git a/tests/integration/test_attach_backup_from_s3_plain/test.py b/tests/integration/test_attach_backup_from_s3_plain/test.py index e575c487b7a..4a8da1e6d66 100644 --- a/tests/integration/test_attach_backup_from_s3_plain/test.py +++ b/tests/integration/test_attach_backup_from_s3_plain/test.py @@ -20,17 +20,27 @@ def start_cluster(): finally: cluster.shutdown() +s3_disk_def = """disk(type=s3_plain, + endpoint='http://minio1:9001/root/data/disks/disk_s3_plain/{backup_name}/', + access_key_id='minio', + secret_access_key='minio123');""" + +local_disk_def = "disk(type=object_storage, object_storage_type = 'local', metadata_type = 'plain'" @pytest.mark.parametrize( - "table_name,backup_name,storage_policy,min_bytes_for_wide_part", + "table_name,backup_name,storage_policy,disk_def,min_bytes_for_wide_part", [ pytest.param( - "compact", "backup_compact", "s3_backup_compact", int(1e9), id="compact" + "compact", "backup_compact_s3", "backup_disk_s3_plain", s3_disk_def, int(1e9), id="compact" ), - pytest.param("wide", "backup_wide", "s3_backup_wide", int(0), id="wide"), + pytest.param("wide", "backup_wide_s3", "backup_disk_s3_plain", s3_disk_def, int(0), id="wide"), + pytest.param( + "compact", "backup_compact_local", "backup_disk_local_plain", local_disk_def, int(1e9), id="compact" + ), + pytest.param("wide", "backup_wide_local", "backup_disk_local_plain", local_disk_def, int(0), id="wide"), ], ) -def test_attach_part(table_name, backup_name, storage_policy, min_bytes_for_wide_part): +def test_attach_part(table_name, backup_name, storage_policy, disk_def, min_bytes_for_wide_part): node.query( f""" -- Catch any errors (NOTE: warnings are ok) @@ -45,7 +55,7 @@ def test_attach_part(table_name, backup_name, storage_policy, min_bytes_for_wide settings min_bytes_for_wide_part={min_bytes_for_wide_part} as select number%5 part, number key from numbers(100); - backup table ordinary_db.{table_name} TO Disk('backup_disk_s3_plain', '{backup_name}') settings deduplicate_files=0; + backup table ordinary_db.{table_name} TO Disk('{storage_policy}', '{backup_name}') settings deduplicate_files=0; drop table ordinary_db.{table_name}; attach table ordinary_db.{table_name} (part UInt8, key UInt64) @@ -53,10 +63,7 @@ def test_attach_part(table_name, backup_name, storage_policy, min_bytes_for_wide order by key partition by part settings max_suspicious_broken_parts=0, - disk=disk(type=s3_plain, - endpoint='http://minio1:9001/root/data/disks/disk_s3_plain/{backup_name}/', - access_key_id='minio', - secret_access_key='minio123'); + disk={disk_def} """ ) From 69b5bd02a915ae044b4116de759d11ae80525dc5 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 26 Feb 2024 09:37:17 +0000 Subject: [PATCH 0047/1081] Automatic style fix --- .../test_attach_backup_from_s3_plain/test.py | 42 ++++++++++++++++--- 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_attach_backup_from_s3_plain/test.py b/tests/integration/test_attach_backup_from_s3_plain/test.py index 4a8da1e6d66..900366b2c9c 100644 --- a/tests/integration/test_attach_backup_from_s3_plain/test.py +++ b/tests/integration/test_attach_backup_from_s3_plain/test.py @@ -20,27 +20,57 @@ def start_cluster(): finally: cluster.shutdown() + s3_disk_def = """disk(type=s3_plain, endpoint='http://minio1:9001/root/data/disks/disk_s3_plain/{backup_name}/', access_key_id='minio', secret_access_key='minio123');""" -local_disk_def = "disk(type=object_storage, object_storage_type = 'local', metadata_type = 
'plain'" +local_disk_def = ( + "disk(type=object_storage, object_storage_type = 'local', metadata_type = 'plain'" +) + @pytest.mark.parametrize( "table_name,backup_name,storage_policy,disk_def,min_bytes_for_wide_part", [ pytest.param( - "compact", "backup_compact_s3", "backup_disk_s3_plain", s3_disk_def, int(1e9), id="compact" + "compact", + "backup_compact_s3", + "backup_disk_s3_plain", + s3_disk_def, + int(1e9), + id="compact", ), - pytest.param("wide", "backup_wide_s3", "backup_disk_s3_plain", s3_disk_def, int(0), id="wide"), pytest.param( - "compact", "backup_compact_local", "backup_disk_local_plain", local_disk_def, int(1e9), id="compact" + "wide", + "backup_wide_s3", + "backup_disk_s3_plain", + s3_disk_def, + int(0), + id="wide", + ), + pytest.param( + "compact", + "backup_compact_local", + "backup_disk_local_plain", + local_disk_def, + int(1e9), + id="compact", + ), + pytest.param( + "wide", + "backup_wide_local", + "backup_disk_local_plain", + local_disk_def, + int(0), + id="wide", ), - pytest.param("wide", "backup_wide_local", "backup_disk_local_plain", local_disk_def, int(0), id="wide"), ], ) -def test_attach_part(table_name, backup_name, storage_policy, disk_def, min_bytes_for_wide_part): +def test_attach_part( + table_name, backup_name, storage_policy, disk_def, min_bytes_for_wide_part +): node.query( f""" -- Catch any errors (NOTE: warnings are ok) From f53f43b78d3cf2da6219ea4bdea7018d9811ae54 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 27 Feb 2024 17:33:47 +0800 Subject: [PATCH 0048/1081] Fixes for LocalObjectStorage and plain metadata --- .../Local/LocalObjectStorage.cpp | 37 +++++++++++++++++-- .../ObjectStorages/Local/LocalObjectStorage.h | 4 ++ .../MetadataStorageFromPlainObjectStorage.cpp | 5 +-- .../ObjectStorages/ObjectStorageFactory.cpp | 31 ++++++++++------ src/Disks/ObjectStorages/PlainObjectStorage.h | 6 +++ src/Disks/ObjectStorages/S3/DiskS3Utils.cpp | 6 --- src/Disks/ObjectStorages/S3/DiskS3Utils.h | 1 - .../ObjectStorages/S3/S3ObjectStorage.cpp | 2 + .../configs/disk_s3.xml | 4 +- .../test_attach_backup_from_s3_plain/test.py | 7 ++-- 10 files changed, 71 insertions(+), 32 deletions(-) diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index 02700b358e0..51c260cc270 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -31,6 +31,8 @@ LocalObjectStorage::LocalObjectStorage(String key_prefix_) description = *block_device_id; else description = "/"; + + fs::create_directories(getCommonKeyPrefix()); } bool LocalObjectStorage::exists(const StoredObject & object) const @@ -53,6 +55,7 @@ std::unique_ptr LocalObjectStorage::readObjects( /// NOL return createReadBufferFromFileBase(file_path, modified_settings, read_hint, file_size); }; + LOG_TEST(log, "Read object: {}", objects[0].remote_path); switch (read_settings.remote_fs_method) { case RemoteFSReadMethod::read: @@ -111,8 +114,8 @@ std::unique_ptr LocalObjectStorage::readObject( /// NOLI if (!file_size) file_size = tryGetSizeFromFilePath(path); - LOG_TEST(log, "Read object: {}", path); - return createReadBufferFromFileBase(path, patchSettings(read_settings), read_hint, file_size); + LOG_TEST(log, "Read object: {}", object.remote_path); + return createReadBufferFromFileBase(object.remote_path, patchSettings(read_settings), read_hint, file_size); } std::unique_ptr LocalObjectStorage::writeObject( /// NOLINT @@ -126,6 +129,7 @@ std::unique_ptr 
LocalObjectStorage::writeObject( /// NO throw Exception(ErrorCodes::BAD_ARGUMENTS, "LocalObjectStorage doesn't support append to files"); LOG_TEST(log, "Write object: {}", object.remote_path); + fs::create_directories(fs::path(object.remote_path).parent_path()); return std::make_unique(object.remote_path, buf_size); } @@ -157,9 +161,34 @@ void LocalObjectStorage::removeObjectsIfExist(const StoredObjects & objects) removeObjectIfExists(object); } -ObjectMetadata LocalObjectStorage::getObjectMetadata(const std::string & /* path */) const +ObjectMetadata LocalObjectStorage::getObjectMetadata(const std::string & path) const { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Metadata is not supported for LocalObjectStorage"); + ObjectMetadata object_metadata; + LOG_TEST(log, "Getting metadata for path: {}", path); + object_metadata.size_bytes = fs::file_size(path); + object_metadata.last_modified = Poco::Timestamp::fromEpochTime( + std::chrono::duration_cast(fs::last_write_time(path).time_since_epoch()).count()); + return object_metadata; +} + +void LocalObjectStorage::listObjects(const std::string & path, RelativePathsWithMetadata & children, int /* max_keys */) const +{ + for (const auto & entry : fs::directory_iterator(path)) + { + if (entry.is_directory()) + { + listObjects(entry.path(), children, 0); + continue; + } + + auto metadata = getObjectMetadata(entry.path()); + children.emplace_back(entry.path(), std::move(metadata)); + } +} + +bool LocalObjectStorage::existsOrHasAnyChild(const std::string & path) const +{ + return exists(StoredObject(path)); } void LocalObjectStorage::copyObject( // NOLINT diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h index ed5f8c1f537..22429a99c76 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.h +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.h @@ -58,6 +58,10 @@ public: ObjectMetadata getObjectMetadata(const std::string & path) const override; + void listObjects(const std::string & path, RelativePathsWithMetadata & children, int max_keys) const override; + + bool existsOrHasAnyChild(const std::string & path) const override; + void copyObject( /// NOLINT const StoredObject & object_from, const StoredObject & object_to, diff --git a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp index b03809f5b39..4b8fc74e956 100644 --- a/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp +++ b/src/Disks/ObjectStorages/MetadataStorageFromPlainObjectStorage.cpp @@ -48,10 +48,7 @@ bool MetadataStorageFromPlainObjectStorage::isDirectory(const std::string & path std::string directory = object_key.serialize(); if (!directory.ends_with('/')) directory += '/'; - - RelativePathsWithMetadata files; - object_storage->listObjects(directory, files, 1); - return !files.empty(); + return object_storage->existsOrHasAnyChild(directory); } uint64_t MetadataStorageFromPlainObjectStorage::getFileSize(const String & path) const diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 6f6ff199902..f64c42c1403 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -36,16 +36,24 @@ namespace ErrorCodes namespace { + bool isPlainStorage( + ObjectStorageType type, + const Poco::Util::AbstractConfiguration & config, + const std::string & config_prefix) + { + auto 
compatibility_hint = MetadataStorageFactory::getCompatibilityMetadataTypeHint(type); + auto metadata_type = MetadataStorageFactory::getMetadataType(config, config_prefix, compatibility_hint); + return metadataTypeFromString(metadata_type) == MetadataStorageType::Plain; + } + template ObjectStoragePtr createObjectStorage( + ObjectStorageType type, const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix, Args && ...args) { - auto compatibility_hint = MetadataStorageFactory::getCompatibilityMetadataTypeHint(ObjectStorageType::S3); - auto metadata_type = MetadataStorageFactory::getMetadataType(config, config_prefix, compatibility_hint); - - if (metadataTypeFromString(metadata_type) == MetadataStorageType::Plain) + if (isPlainStorage(type, config, config_prefix)) { return std::make_shared>(std::forward(args)...); } @@ -151,10 +159,10 @@ void registerS3ObjectStorage(ObjectStorageFactory & factory) auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); auto settings = getSettings(config, config_prefix, context); auto client = getClient(config, config_prefix, context, *settings); - auto key_generator = getKeyGenerator(disk_type, uri, config, config_prefix); + auto key_generator = getKeyGenerator(uri, config, config_prefix); auto object_storage = createObjectStorage( - config, config_prefix, std::move(client), std::move(settings), uri, s3_capabilities, key_generator, name); + ObjectStorageType::S3, config, config_prefix, std::move(client), std::move(settings), uri, s3_capabilities, key_generator, name); /// NOTE: should we still perform this check for clickhouse-disks? if (!skip_access_check) @@ -187,7 +195,7 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) auto s3_capabilities = getCapabilitiesFromConfig(config, config_prefix); auto settings = getSettings(config, config_prefix, context); auto client = getClient(config, config_prefix, context, *settings); - auto key_generator = getKeyGenerator(disk_type, uri, config, config_prefix); + auto key_generator = getKeyGenerator(uri, config, config_prefix); auto object_storage = std::make_shared>( std::move(client), std::move(settings), uri, s3_capabilities, key_generator, name); @@ -222,7 +230,7 @@ void registerHDFSObjectStorage(ObjectStorageFactory & factory) context->getSettingsRef().hdfs_replication ); - return createObjectStorage(config, config_prefix, uri, std::move(settings), config); + return createObjectStorage(ObjectStorageType::HDFS, config, config_prefix, uri, std::move(settings), config); }); } #endif @@ -239,8 +247,7 @@ void registerAzureObjectStorage(ObjectStorageFactory & factory) { String container_name = config.getString(config_prefix + ".container_name", "default-container"); return createObjectStorage( - config, config_prefix, - name, + ObjectStorageType::Azure, config, config_prefix, name, getAzureBlobContainerClient(config, config_prefix), getAzureBlobStorageSettings(config, config_prefix, context), container_name); @@ -273,7 +280,7 @@ void registerWebObjectStorage(ObjectStorageFactory & factory) ErrorCodes::BAD_ARGUMENTS, "Bad URI: `{}`. 
Error: {}", uri, e.what()); } - return createObjectStorage(config, config_prefix, uri, context); + return createObjectStorage(ObjectStorageType::Web, config, config_prefix, uri, context); }); } @@ -291,7 +298,7 @@ void registerLocalObjectStorage(ObjectStorageFactory & factory) loadDiskLocalConfig(name, config, config_prefix, context, object_key_prefix, keep_free_space_bytes); /// keys are mapped to the fs, object_key_prefix is a directory also fs::create_directories(object_key_prefix); - return createObjectStorage(config, config_prefix, object_key_prefix); + return createObjectStorage(ObjectStorageType::Local, config, config_prefix, object_key_prefix); }); } #endif diff --git a/src/Disks/ObjectStorages/PlainObjectStorage.h b/src/Disks/ObjectStorages/PlainObjectStorage.h index 3a81b85c44b..e0907d0b4d8 100644 --- a/src/Disks/ObjectStorages/PlainObjectStorage.h +++ b/src/Disks/ObjectStorages/PlainObjectStorage.h @@ -1,5 +1,6 @@ #pragma once #include +#include namespace DB { @@ -24,6 +25,11 @@ public: bool isWriteOnce() const override { return true; } bool isPlain() const override { return true; } + + ObjectStorageKey generateObjectKeyForPath(const std::string & path) const override + { + return ObjectStorageKey::createAsRelative(BaseObjectStorage::getCommonKeyPrefix(), path); + } }; } diff --git a/src/Disks/ObjectStorages/S3/DiskS3Utils.cpp b/src/Disks/ObjectStorages/S3/DiskS3Utils.cpp index bb7b53b2d22..4b889f89f90 100644 --- a/src/Disks/ObjectStorages/S3/DiskS3Utils.cpp +++ b/src/Disks/ObjectStorages/S3/DiskS3Utils.cpp @@ -15,16 +15,10 @@ namespace ErrorCodes } ObjectStorageKeysGeneratorPtr getKeyGenerator( - String type, const S3::URI & uri, const Poco::Util::AbstractConfiguration & config, const String & config_prefix) { - if (type == "s3_plain") - return createObjectStorageKeysGeneratorAsIsWithPrefix(uri.key); - - chassert(type == "s3"); - bool storage_metadata_write_full_object_key = DiskObjectStorageMetadata::getWriteFullObjectKeySetting(); bool send_metadata = config.getBool(config_prefix + ".send_metadata", false); diff --git a/src/Disks/ObjectStorages/S3/DiskS3Utils.h b/src/Disks/ObjectStorages/S3/DiskS3Utils.h index 29e39d4bc1b..8524a9ccac3 100644 --- a/src/Disks/ObjectStorages/S3/DiskS3Utils.h +++ b/src/Disks/ObjectStorages/S3/DiskS3Utils.h @@ -12,7 +12,6 @@ namespace DB namespace S3 { struct URI; } ObjectStorageKeysGeneratorPtr getKeyGenerator( - String type, const S3::URI & uri, const Poco::Util::AbstractConfiguration & config, const String & config_prefix); diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index 5771eb1ebe0..b2a9ab8fdc3 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -561,6 +561,8 @@ std::unique_ptr S3ObjectStorage::cloneObjectStorage( ObjectStorageKey S3ObjectStorage::generateObjectKeyForPath(const std::string & path) const { + if (!key_generator) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Key generator is not set"); return key_generator->generate(path); } diff --git a/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml b/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml index 3166eea7ccb..2edabc76c8b 100644 --- a/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml +++ b/tests/integration/test_attach_backup_from_s3_plain/configs/disk_s3.xml @@ -10,9 +10,9 @@ object_storage - local + local_blob_storage plain - local_plain/ + /local_plain/ diff --git 
a/tests/integration/test_attach_backup_from_s3_plain/test.py b/tests/integration/test_attach_backup_from_s3_plain/test.py index 4a8da1e6d66..983275cc24f 100644 --- a/tests/integration/test_attach_backup_from_s3_plain/test.py +++ b/tests/integration/test_attach_backup_from_s3_plain/test.py @@ -21,11 +21,11 @@ def start_cluster(): cluster.shutdown() s3_disk_def = """disk(type=s3_plain, - endpoint='http://minio1:9001/root/data/disks/disk_s3_plain/{backup_name}/', + endpoint='http://minio1:9001/root/data/disks/disk_s3_plain/{}/', access_key_id='minio', secret_access_key='minio123');""" -local_disk_def = "disk(type=object_storage, object_storage_type = 'local', metadata_type = 'plain'" +local_disk_def = "disk(type=object_storage, object_storage_type = 'local_blob_storage', metadata_type = 'plain', path = '/local_plain/{}/')" @pytest.mark.parametrize( "table_name,backup_name,storage_policy,disk_def,min_bytes_for_wide_part", @@ -41,6 +41,7 @@ local_disk_def = "disk(type=object_storage, object_storage_type = 'local', metad ], ) def test_attach_part(table_name, backup_name, storage_policy, disk_def, min_bytes_for_wide_part): + disk_definition = disk_def.format(backup_name) node.query( f""" -- Catch any errors (NOTE: warnings are ok) @@ -63,7 +64,7 @@ def test_attach_part(table_name, backup_name, storage_policy, disk_def, min_byte order by key partition by part settings max_suspicious_broken_parts=0, - disk={disk_def} + disk={disk_definition} """ ) From fb38bd139c433ead685028f232e8c4fad5e566d2 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 27 Feb 2024 17:38:02 +0800 Subject: [PATCH 0049/1081] Remove debug logging --- src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index 51c260cc270..4ec998a2bb0 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -55,7 +55,6 @@ std::unique_ptr LocalObjectStorage::readObjects( /// NOL return createReadBufferFromFileBase(file_path, modified_settings, read_hint, file_size); }; - LOG_TEST(log, "Read object: {}", objects[0].remote_path); switch (read_settings.remote_fs_method) { case RemoteFSReadMethod::read: @@ -109,10 +108,8 @@ std::unique_ptr LocalObjectStorage::readObject( /// NOLI std::optional read_hint, std::optional file_size) const { - const auto & path = object.remote_path; - if (!file_size) - file_size = tryGetSizeFromFilePath(path); + file_size = tryGetSizeFromFilePath(object.remote_path); LOG_TEST(log, "Read object: {}", object.remote_path); return createReadBufferFromFileBase(object.remote_path, patchSettings(read_settings), read_hint, file_size); From 978fe9fa1a069a231bb52c66b3898c6ce112a215 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 27 Feb 2024 17:43:34 +0800 Subject: [PATCH 0050/1081] Add comments --- src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index 4ec998a2bb0..7f34ca48f7f 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -126,7 +126,11 @@ std::unique_ptr LocalObjectStorage::writeObject( /// NO throw Exception(ErrorCodes::BAD_ARGUMENTS, "LocalObjectStorage doesn't support append to files"); LOG_TEST(log, "Write object: {}", 
object.remote_path); + + /// Unlike real blob storage, in local fs we cannot create a file with non-existing prefix. + /// So let's create it. fs::create_directories(fs::path(object.remote_path).parent_path()); + return std::make_unique(object.remote_path, buf_size); } @@ -185,6 +189,8 @@ void LocalObjectStorage::listObjects(const std::string & path, RelativePathsWith bool LocalObjectStorage::existsOrHasAnyChild(const std::string & path) const { + /// Unlike real object storage, existance of a prefix path can be checked by + /// just checking existence of this prefix directly, so simple exists is enough here. return exists(StoredObject(path)); } From 33788250b1f74384661cd241e2badef82c8fdbf6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 27 Feb 2024 18:07:19 +0800 Subject: [PATCH 0051/1081] Update test.py --- tests/integration/test_attach_backup_from_s3_plain/test.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/integration/test_attach_backup_from_s3_plain/test.py b/tests/integration/test_attach_backup_from_s3_plain/test.py index 3a0fa70a715..c2f8936b82c 100644 --- a/tests/integration/test_attach_backup_from_s3_plain/test.py +++ b/tests/integration/test_attach_backup_from_s3_plain/test.py @@ -26,9 +26,8 @@ s3_disk_def = """disk(type=s3_plain, access_key_id='minio', secret_access_key='minio123');""" -local_disk_def = ( - "disk(type=object_storage, object_storage_type = 'local', metadata_type = 'plain', path = '/local_plain/{}/'" -) +local_disk_def = "disk(type=object_storage, object_storage_type = 'local_blob_storage', metadata_type = 'plain', path = '/local_plain/{}/');" + @pytest.mark.parametrize( "table_name,backup_name,storage_policy,disk_def,min_bytes_for_wide_part", @@ -67,7 +66,6 @@ local_disk_def = ( ), ], ) - def test_attach_part( table_name, backup_name, storage_policy, disk_def, min_bytes_for_wide_part ): From 98b27fd45fbe1109442c2313181ca4e8435e2024 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 27 Feb 2024 23:00:27 +0800 Subject: [PATCH 0052/1081] Fix style check --- src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp | 2 +- src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index 7f34ca48f7f..eba57969580 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -189,7 +189,7 @@ void LocalObjectStorage::listObjects(const std::string & path, RelativePathsWith bool LocalObjectStorage::existsOrHasAnyChild(const std::string & path) const { - /// Unlike real object storage, existance of a prefix path can be checked by + /// Unlike real object storage, existence of a prefix path can be checked by /// just checking existence of this prefix directly, so simple exists is enough here. 
return exists(StoredObject(path)); } diff --git a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp index b2a9ab8fdc3..eec3a5914fc 100644 --- a/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp +++ b/src/Disks/ObjectStorages/S3/S3ObjectStorage.cpp @@ -48,6 +48,7 @@ namespace ErrorCodes { extern const int S3_ERROR; extern const int BAD_ARGUMENTS; + extern const int LOGICAL_ERROR; } namespace From 416638461fe832673252445d8fabb3fe554eed49 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 27 Feb 2024 15:02:13 +0000 Subject: [PATCH 0053/1081] Try to fix logical error 'Cannot capture column because it has incompatible type' in mapContainsKeyLike --- src/Functions/array/FunctionArrayMapped.h | 4 ++-- .../03002_map_array_functions_with_low_cardinality.reference | 1 + .../03002_map_array_functions_with_low_cardinality.sql | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.reference create mode 100644 tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.sql diff --git a/src/Functions/array/FunctionArrayMapped.h b/src/Functions/array/FunctionArrayMapped.h index 49ed9d495e2..136d3481771 100644 --- a/src/Functions/array/FunctionArrayMapped.h +++ b/src/Functions/array/FunctionArrayMapped.h @@ -355,7 +355,7 @@ public: { arrays.emplace_back( column_tuple->getColumnPtr(j), - recursiveRemoveLowCardinality(type_tuple.getElement(j)), + type_tuple.getElement(j), array_with_type_and_name.name + "." + tuple_names[j]); } } @@ -363,7 +363,7 @@ public: { arrays.emplace_back( column_array->getDataPtr(), - recursiveRemoveLowCardinality(array_type->getNestedType()), + array_type->getNestedType(), array_with_type_and_name.name); } diff --git a/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.reference b/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.reference new file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.sql b/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.sql new file mode 100644 index 00000000000..8240a8f93f5 --- /dev/null +++ b/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.sql @@ -0,0 +1,2 @@ +SELECT mapContainsKeyLike(map('aa', toLowCardinality(1), 'bb', toLowCardinality(2)), toLowCardinality('a%')); + From 7ac453ab4c2f7162c5dc25f29eaf396b670357a6 Mon Sep 17 00:00:00 2001 From: kssenii Date: Tue, 27 Feb 2024 23:06:14 +0800 Subject: [PATCH 0054/1081] Revert "Merge pull request #60436 from nickitat/revert_56864" This reverts commit 8719a601fac863a2c484bcf97339aecdf9e73c5f, reversing changes made to 657857f9828eb46867197c6f7bc8c2444ab1cc3d. 
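This reinstates the broken-projections handling below: parts with a broken
projection can be loaded, checked, backed up and repaired instead of failing
outright. A minimal usage sketch of the two backup settings restored by this
revert (the setting names come from the BackupSettings diff below; the table
and disk names are illustrative only):

    -- Back up a table even if some of its projections are broken,
    -- skipping per-projection checksum validation:
    BACKUP TABLE test4 TO Disk('backups', 'b1')
    SETTINGS check_projection_parts = 0, allow_backup_broken_projections = 1;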
--- src/Backups/BackupSettings.cpp | 2 + src/Backups/BackupSettings.h | 6 + src/Common/ErrorCodes.cpp | 1 + src/Interpreters/MutationsInterpreter.cpp | 14 +- src/Interpreters/MutationsInterpreter.h | 1 + .../Optimizations/projectionsCommon.cpp | 2 +- .../MergeTree/DataPartStorageOnDiskBase.cpp | 35 +- .../MergeTree/DataPartStorageOnDiskBase.h | 4 +- src/Storages/MergeTree/IDataPartStorage.h | 4 +- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 62 +- src/Storages/MergeTree/IMergeTreeDataPart.h | 17 +- src/Storages/MergeTree/MergeTask.cpp | 5 +- src/Storages/MergeTree/MergeTreeData.cpp | 37 +- src/Storages/MergeTree/MergeTreeData.h | 9 +- .../MergeTree/MergeTreeDataPartChecksum.h | 2 + src/Storages/MergeTree/MutateTask.cpp | 14 +- .../ReplicatedMergeTreePartCheckThread.cpp | 38 +- .../ReplicatedMergeTreePartCheckThread.h | 4 +- src/Storages/MergeTree/checkDataPart.cpp | 71 ++- src/Storages/MergeTree/checkDataPart.h | 4 +- src/Storages/StorageMergeTree.cpp | 5 +- src/Storages/StorageReplicatedMergeTree.cpp | 3 +- .../System/StorageSystemProjectionParts.cpp | 34 +- .../test_broken_projections/__init__.py | 0 .../config.d/backups.xml | 13 + .../test_broken_projections/test.py | 576 ++++++++++++++++++ .../02117_show_create_table_system.reference | 3 + 27 files changed, 907 insertions(+), 59 deletions(-) create mode 100644 tests/integration/test_broken_projections/__init__.py create mode 100644 tests/integration/test_broken_projections/config.d/backups.xml create mode 100644 tests/integration/test_broken_projections/test.py diff --git a/src/Backups/BackupSettings.cpp b/src/Backups/BackupSettings.cpp index 68d825e9468..51d713f03e1 100644 --- a/src/Backups/BackupSettings.cpp +++ b/src/Backups/BackupSettings.cpp @@ -32,6 +32,8 @@ namespace ErrorCodes M(UInt64, shard_num) \ M(UInt64, replica_num) \ M(Bool, check_parts) \ + M(Bool, check_projection_parts) \ + M(Bool, allow_backup_broken_projections) \ M(Bool, internal) \ M(String, host_id) \ M(OptionalUUID, backup_uuid) diff --git a/src/Backups/BackupSettings.h b/src/Backups/BackupSettings.h index f26b992b348..ec430905f51 100644 --- a/src/Backups/BackupSettings.h +++ b/src/Backups/BackupSettings.h @@ -62,6 +62,12 @@ struct BackupSettings /// Check checksums of the data parts before writing them to a backup. bool check_parts = true; + /// Check checksums of the projection data parts before writing them to a backup. + bool check_projection_parts = true; + + /// Allow to create backup with broken projections. + bool allow_backup_broken_projections = false; + /// Internal, should not be specified by user. /// Whether this backup is a part of a distributed backup created by BACKUP ON CLUSTER. 
bool internal = false; diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index ca00f2fd513..1ce8997e928 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -592,6 +592,7 @@ M(710, FAULT_INJECTED) \ M(711, FILECACHE_ACCESS_DENIED) \ M(712, TOO_MANY_MATERIALIZED_VIEWS) \ + M(713, BROKEN_PROJECTION) \ M(714, UNEXPECTED_CLUSTER) \ M(715, CANNOT_DETECT_FORMAT) \ M(716, CANNOT_FORGET_PARTITION) \ diff --git a/src/Interpreters/MutationsInterpreter.cpp b/src/Interpreters/MutationsInterpreter.cpp index a3d1b84fdc1..502b961ced8 100644 --- a/src/Interpreters/MutationsInterpreter.cpp +++ b/src/Interpreters/MutationsInterpreter.cpp @@ -342,6 +342,11 @@ bool MutationsInterpreter::Source::hasProjection(const String & name) const return part && part->hasProjection(name); } +bool MutationsInterpreter::Source::hasBrokenProjection(const String & name) const +{ + return part && part->hasBrokenProjection(name); +} + bool MutationsInterpreter::Source::isCompactPart() const { return part && part->getType() == MergeTreeDataPartType::Compact; @@ -807,7 +812,7 @@ void MutationsInterpreter::prepare(bool dry_run) { mutation_kind.set(MutationKind::MUTATE_INDEX_STATISTIC_PROJECTION); const auto & projection = projections_desc.get(command.projection_name); - if (!source.hasProjection(projection.name)) + if (!source.hasProjection(projection.name) || source.hasBrokenProjection(projection.name)) { for (const auto & column : projection.required_columns) dependencies.emplace(column, ColumnDependency::PROJECTION); @@ -994,6 +999,13 @@ void MutationsInterpreter::prepare(bool dry_run) if (!source.hasProjection(projection.name)) continue; + /// Always rebuild broken projections. + if (source.hasBrokenProjection(projection.name)) + { + materialized_projections.insert(projection.name); + continue; + } + if (need_rebuild_projections) { materialized_projections.insert(projection.name); diff --git a/src/Interpreters/MutationsInterpreter.h b/src/Interpreters/MutationsInterpreter.h index eda94190185..4c35ec34b58 100644 --- a/src/Interpreters/MutationsInterpreter.h +++ b/src/Interpreters/MutationsInterpreter.h @@ -126,6 +126,7 @@ public: bool materializeTTLRecalculateOnly() const; bool hasSecondaryIndex(const String & name) const; bool hasProjection(const String & name) const; + bool hasBrokenProjection(const String & name) const; bool isCompactPart() const; void read( diff --git a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp index 8333f5e857b..3009460a468 100644 --- a/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp +++ b/src/Processors/QueryPlan/Optimizations/projectionsCommon.cpp @@ -223,7 +223,7 @@ bool analyzeProjectionCandidate( { const auto & created_projections = part_with_ranges.data_part->getProjectionParts(); auto it = created_projections.find(candidate.projection->name); - if (it != created_projections.end()) + if (it != created_projections.end() && !it->second->is_broken) { projection_parts.push_back(it->second); } diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp index 5210d14f3d0..e31d991ef09 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.cpp @@ -335,7 +335,9 @@ void DataPartStorageOnDiskBase::backup( const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const + 
TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const { fs::path part_path_on_disk = fs::path{root_path} / part_dir; fs::path part_path_in_backup = fs::path{path_in_backup} / part_dir; @@ -377,7 +379,7 @@ void DataPartStorageOnDiskBase::backup( bool copy_encrypted = !backup_settings.decrypt_files_from_encrypted_disks; - for (const auto & filepath : files_to_backup) + auto backup_file = [&](const String & filepath) { auto filepath_on_disk = part_path_on_disk / filepath; auto filepath_in_backup = part_path_in_backup / filepath; @@ -385,8 +387,10 @@ void DataPartStorageOnDiskBase::backup( if (files_without_checksums.contains(filepath)) { backup_entries.emplace_back(filepath_in_backup, std::make_unique(disk, filepath_on_disk, read_settings, copy_encrypted)); - continue; + return; } + else if (is_projection_part && allow_backup_broken_projection && !disk->exists(filepath_on_disk)) + return; if (make_temporary_hard_links) { @@ -411,6 +415,31 @@ void DataPartStorageOnDiskBase::backup( backup_entry = wrapBackupEntryWith(std::move(backup_entry), temp_dir_owner); backup_entries.emplace_back(filepath_in_backup, std::move(backup_entry)); + }; + + auto * log = &Poco::Logger::get("DataPartStorageOnDiskBase::backup"); + + for (const auto & filepath : files_to_backup) + { + if (is_projection_part && allow_backup_broken_projection) + { + try + { + backup_file(filepath); + } + catch (Exception & e) + { + if (e.code() != ErrorCodes::FILE_DOESNT_EXIST) + throw; + + LOG_ERROR(log, "Cannot backup file {} of projection part {}. Will try to ignore it", filepath, part_dir); + continue; + } + } + else + { + backup_file(filepath); + } } } diff --git a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h index 52dc850c7fd..75bf3d6f93c 100644 --- a/src/Storages/MergeTree/DataPartStorageOnDiskBase.h +++ b/src/Storages/MergeTree/DataPartStorageOnDiskBase.h @@ -58,7 +58,9 @@ public: const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const override; + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const override; MutableDataPartStoragePtr freeze( const std::string & to, diff --git a/src/Storages/MergeTree/IDataPartStorage.h b/src/Storages/MergeTree/IDataPartStorage.h index 5899ef58cd5..d06d9791a53 100644 --- a/src/Storages/MergeTree/IDataPartStorage.h +++ b/src/Storages/MergeTree/IDataPartStorage.h @@ -223,7 +223,9 @@ public: const ReadSettings & read_settings, bool make_temporary_hard_links, BackupEntries & backup_entries, - TemporaryFilesOnDisks * temp_dirs) const = 0; + TemporaryFilesOnDisks * temp_dirs, + bool is_projection_part, + bool allow_backup_broken_projection) const = 0; /// Creates hardlinks into 'to/dir_path' for every file in data part. /// Callback is called after hardlinks are created, but before 'delete-on-destroy.txt' marker is removed. diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index e06ea5e560c..11ede661f78 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -699,13 +699,14 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks calculateColumnsAndSecondaryIndicesSizesOnDisk(); loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. 
loadPartitionAndMinMaxIndex(); + bool has_broken_projections = false; if (!parent_part) { loadTTLInfos(); - loadProjections(require_columns_checksums, check_consistency, false /* if_not_loaded */); + loadProjections(require_columns_checksums, check_consistency, has_broken_projections, false /* if_not_loaded */); } - if (check_consistency) + if (check_consistency && !has_broken_projections) checkConsistency(require_columns_checksums); loadDefaultCompressionCodec(); @@ -770,7 +771,7 @@ void IMergeTreeDataPart::addProjectionPart( projection_parts[projection_name] = std::move(projection_part); } -void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded) +void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded) { auto metadata_snapshot = storage.getInMemoryMetadataPtr(); for (const auto & projection : metadata_snapshot->projections) @@ -787,10 +788,34 @@ void IMergeTreeDataPart::loadProjections(bool require_columns_checksums, bool ch else { auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build(); - part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); + + try + { + part->loadColumnsChecksumsIndexes(require_columns_checksums, check_consistency); + } + catch (...) + { + if (isRetryableException(std::current_exception())) + throw; + + auto message = getCurrentExceptionMessage(true); + LOG_ERROR(&Poco::Logger::get("IMergeTreeDataPart"), + "Cannot load projection {}, will consider it broken. Reason: {}", projection.name, message); + + has_broken_projection = true; + part->setBrokenReason(message, getCurrentExceptionCode()); + } + addProjectionPart(projection.name, std::move(part)); } } + else if (checksums.has(path)) + { + auto part = getProjectionPartBuilder(projection.name).withPartFormatFromDisk().build(); + part->setBrokenReason("Projection directory " + path + " does not exist while loading projections", ErrorCodes::NO_FILE_IN_DATA_PART); + addProjectionPart(projection.name, std::move(part)); + has_broken_projection = true; + } } } @@ -1189,7 +1214,8 @@ void IMergeTreeDataPart::loadChecksums(bool require) /// Check the data while we are at it. LOG_WARNING(storage.log, "Checksums for part {} not found. 
Will calculate them from data on disk.", name);
 
-        checksums = checkDataPart(shared_from_this(), false);
+        bool noop;
+        checksums = checkDataPart(shared_from_this(), false, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */false);
 
         writeChecksums(checksums, {});
 
         bytes_on_disk = checksums.getTotalSizeOnDisk();
@@ -2196,6 +2222,32 @@ std::optional IMergeTreeDataPart::getStreamNameForColumn(
     return getStreamNameOrHash(stream_name, extension, storage_);
 }
 
+void IMergeTreeDataPart::markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const
+{
+    auto it = projection_parts.find(projection_name);
+    if (it == projection_parts.end())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "There is no projection part '{}'", projection_name);
+    it->second->setBrokenReason(message, code);
+}
+
+bool IMergeTreeDataPart::hasBrokenProjection(const String & projection_name) const
+{
+    auto it = projection_parts.find(projection_name);
+    if (it == projection_parts.end())
+        return false;
+    return it->second->is_broken;
+}
+
+void IMergeTreeDataPart::setBrokenReason(const String & message, int code) const
+{
+    std::lock_guard lock(broken_reason_mutex);
+    if (is_broken)
+        return;
+    is_broken = true;
+    exception = message;
+    exception_code = code;
+}
+
 bool isCompactPart(const MergeTreeDataPartPtr & data_part)
 {
     return (data_part && data_part->getType() == MergeTreeDataPartType::Compact);
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h
index 91c559d30c8..0d7acfab891 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.h
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.h
@@ -259,6 +259,12 @@ public:
     /// Frozen by ALTER TABLE ... FREEZE ... It is used for information purposes in system.parts table.
     mutable std::atomic is_frozen {false};
 
+    /// If this is a projection part, it can be marked broken when it fails to load or fails a check.
+    mutable std::atomic is_broken {false};
+    mutable std::string exception;
+    mutable int exception_code = 0;
+    mutable std::mutex broken_reason_mutex;
+
     /// Indicates that the part was marked Outdated by PartCheckThread because the part was not committed to ZooKeeper
     mutable bool is_unexpected_local_part = false;
 
@@ -418,9 +424,16 @@ public:
 
     void addProjectionPart(const String & projection_name, std::shared_ptr && projection_part);
 
+    void markProjectionPartAsBroken(const String & projection_name, const String & message, int code) const;
+
     bool hasProjection(const String & projection_name) const { return projection_parts.contains(projection_name); }
 
-    void loadProjections(bool require_columns_checksums, bool check_consistency, bool if_not_loaded = false);
+    bool hasBrokenProjection(const String & projection_name) const;
+
+    /// Sets has_broken_projection to true if any projection failed to load or was marked broken.
+    void loadProjections(bool require_columns_checksums, bool check_consistency, bool & has_broken_projection, bool if_not_loaded = false);
+
+    void setBrokenReason(const String & message, int code) const;
+
     /// Return set of metadata file names without checksums. For example,
     /// columns.txt or checksums.txt itself. 
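/// A minimal sketch of how the members declared above are intended to be used,
/// mirroring loadColumnsChecksumsIndexes() earlier in this patch (the projection
/// name is illustrative):
///
///     bool has_broken_projection = false;
///     part->loadProjections(require_columns_checksums, check_consistency, has_broken_projection);
///     if (part->hasBrokenProjection("proj1"))
///         ...; /// the part still loads: the projection is skipped in reads and
///              /// merges, and its error is exposed via system.projection_parts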
@@ -580,7 +593,7 @@ protected: const IMergeTreeDataPart * parent_part; String parent_part_name; - std::map> projection_parts; + mutable std::map> projection_parts; mutable PartMetadataManagerPtr metadata_manager; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index df64ae33713..e6ae63da7e3 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -731,8 +731,9 @@ bool MergeTask::MergeProjectionsStage::mergeMinMaxIndexAndPrepareProjections() c MergeTreeData::DataPartsVector projection_parts; for (const auto & part : global_ctx->future_part->parts) { - auto it = part->getProjectionParts().find(projection.name); - if (it != part->getProjectionParts().end()) + auto actual_projection_parts = part->getProjectionParts(); + auto it = actual_projection_parts.find(projection.name); + if (it != actual_projection_parts.end() && !it->second->is_broken) projection_parts.push_back(it->second); } if (projection_parts.size() < global_ctx->future_part->parts.size()) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 6d5e486f6a1..babc593ff62 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -5311,7 +5311,7 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( if (hold_table_lock && !table_lock) table_lock = lockForShare(local_context->getCurrentQueryId(), local_context->getSettingsRef().lock_acquire_timeout); - if (backup_settings.check_parts) + if (backup_settings.check_projection_parts) part->checkConsistencyWithProjections(/* require_part_metadata= */ true); BackupEntries backup_entries_from_part; @@ -5323,7 +5323,8 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( read_settings, make_temporary_hard_links, backup_entries_from_part, - &temp_dirs); + &temp_dirs, + false, false); auto projection_parts = part->getProjectionParts(); for (const auto & [projection_name, projection_part] : projection_parts) @@ -5336,7 +5337,9 @@ MergeTreeData::PartsBackupEntries MergeTreeData::backupParts( read_settings, make_temporary_hard_links, backup_entries_from_part, - &temp_dirs); + &temp_dirs, + projection_part->is_broken, + backup_settings.allow_backup_broken_projections); } if (hold_storage_and_part_ptrs) @@ -7825,21 +7828,39 @@ MovePartsOutcome MergeTreeData::moveParts(const CurrentlyMovingPartsTaggerPtr & bool MergeTreeData::partsContainSameProjections(const DataPartPtr & left, const DataPartPtr & right, String & out_reason) { - if (left->getProjectionParts().size() != right->getProjectionParts().size()) + auto remove_broken_parts_from_consideration = [](auto & parts) + { + std::set broken_projection_parts; + for (const auto & [name, part] : parts) + { + if (part->is_broken) + broken_projection_parts.emplace(name); + } + for (const auto & name : broken_projection_parts) + parts.erase(name); + }; + + auto left_projection_parts = left->getProjectionParts(); + auto right_projection_parts = right->getProjectionParts(); + + remove_broken_parts_from_consideration(left_projection_parts); + remove_broken_parts_from_consideration(right_projection_parts); + + if (left_projection_parts.size() != right_projection_parts.size()) { out_reason = fmt::format( "Parts have different number of projections: {} in part '{}' and {} in part '{}'", - left->getProjectionParts().size(), + left_projection_parts.size(), left->name, - right->getProjectionParts().size(), + right_projection_parts.size(), right->name ); return false; } - for 
(const auto & [name, _] : left->getProjectionParts()) + for (const auto & [name, _] : left_projection_parts) { - if (!right->hasProjection(name)) + if (!right_projection_parts.contains(name)) { out_reason = fmt::format( "The part '{}' doesn't have projection '{}' while part '{}' does", right->name, name, left->name diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index ab265715688..c638505604f 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -468,8 +468,13 @@ public: struct ProjectionPartsVector { - DataPartsVector projection_parts; DataPartsVector data_parts; + + DataPartsVector projection_parts; + DataPartStateVector projection_parts_states; + + DataPartsVector broken_projection_parts; + DataPartStateVector broken_projection_parts_states; }; /// Returns a copy of the list so that the caller shouldn't worry about locks. @@ -484,7 +489,7 @@ public: const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const; /// Same as above but only returns projection parts ProjectionPartsVector getProjectionPartsVectorForInternalUsage( - const DataPartStates & affordable_states, DataPartStateVector * out_states = nullptr) const; + const DataPartStates & affordable_states, MergeTreeData::DataPartStateVector * out_states) const; /// Returns absolutely all parts (and snapshot of their states) diff --git a/src/Storages/MergeTree/MergeTreeDataPartChecksum.h b/src/Storages/MergeTree/MergeTreeDataPartChecksum.h index 837b940e354..d4980a67a43 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartChecksum.h +++ b/src/Storages/MergeTree/MergeTreeDataPartChecksum.h @@ -54,6 +54,8 @@ struct MergeTreeDataPartChecksums bool has(const String & file_name) const { return files.find(file_name) != files.end(); } + bool remove(const String & file_name) { return files.erase(file_name); } + bool empty() const { return files.empty(); } /// Checks that the set of columns and their checksums are the same. If not, throws an exception. diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 1c33f018a5d..6bacce9e2c5 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -573,7 +573,9 @@ static std::set getProjectionsToRecalculate( { bool need_recalculate = materialized_projections.contains(projection.name) - || (!is_full_part_storage && source_part->hasProjection(projection.name)); + || (!is_full_part_storage + && source_part->hasProjection(projection.name) + && !source_part->hasBrokenProjection(projection.name)); if (need_recalculate) projections_to_recalc.insert(&projection); @@ -917,7 +919,8 @@ void finalizeMutatedPart( new_data_part->modification_time = time(nullptr); /// Load rest projections which are hardlinked - new_data_part->loadProjections(false, false, true /* if_not_loaded */); + bool noop; + new_data_part->loadProjections(false, false, noop, true /* if_not_loaded */); /// All information about sizes is stored in checksums. /// It doesn't make sense to touch filesystem for sizes. 
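/// For orientation, the user-facing repair path that the mutation changes in this
/// file enable: a projection marked broken is rebuilt explicitly via a mutation,
/// e.g. (the table and projection names are illustrative, mirroring the
/// materialize_projection() helper in the integration test added by this patch):
///
///     ALTER TABLE test2 MATERIALIZE PROJECTION proj1 SETTINGS mutations_sync = 2;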
@@ -1500,7 +1503,9 @@ private:
 
         bool need_recalculate =
             ctx->materialized_projections.contains(projection.name)
-            || (!is_full_part_storage && ctx->source_part->hasProjection(projection.name));
+            || (!is_full_part_storage
+                && ctx->source_part->hasProjection(projection.name)
+                && !ctx->source_part->hasBrokenProjection(projection.name));
 
         if (need_recalculate)
         {
@@ -1637,8 +1642,9 @@ private:
 
     void finalize()
     {
+        bool noop;
         ctx->new_data_part->minmax_idx = std::move(ctx->minmax_idx);
-        ctx->new_data_part->loadProjections(false, false, true /* if_not_loaded */);
+        ctx->new_data_part->loadProjections(false, false, noop, true /* if_not_loaded */);
 
         ctx->mutating_executor.reset();
         ctx->mutating_pipeline.reset();
diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp
index 156c41563ec..bc0b4f73a31 100644
--- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp
@@ -63,7 +63,7 @@ void ReplicatedMergeTreePartCheckThread::enqueuePart(const String & name, time_t
     if (parts_set.contains(name))
         return;
 
-    LOG_TRACE(log, "Enqueueing {} for check after after {}s", name, delay_to_check_seconds);
+    LOG_TRACE(log, "Enqueueing {} for check after {}s", name, delay_to_check_seconds);
     parts_queue.emplace_back(name, std::chrono::steady_clock::now() + std::chrono::seconds(delay_to_check_seconds));
     parts_set.insert(name);
     task->schedule();
@@ -274,7 +274,7 @@ std::pair ReplicatedMergeTreePartCheckThread::findLo
     return std::make_pair(exists_in_zookeeper, part);
 }
 
-ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const String & part_name)
+ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const String & part_name, bool throw_on_broken_projection)
 {
     ReplicatedCheckResult result;
     auto [exists_in_zookeeper, part] = findLocalPart(part_name);
@@ -341,6 +341,7 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St
     /// before the ReplicatedMergeTreePartHeader was introduced.
     String part_path = storage.replica_path + "/parts/" + part_name;
     String part_znode = zookeeper->get(part_path);
+    bool is_broken_projection = false;
 
     try
     {
@@ -362,8 +363,10 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St
 
         checkDataPart(
             part,
-            true,
-            [this] { return need_stop.load(); });
+            /* require_checksums */true,
+            is_broken_projection,
+            [this] { return need_stop.load(); },
+            throw_on_broken_projection);
 
         if (need_stop)
         {
@@ -382,14 +385,27 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St
         if (isRetryableException(std::current_exception()))
             throw;
 
-        tryLogCurrentException(log, __PRETTY_FUNCTION__);
+        PreformattedMessage message;
+        if (is_broken_projection)
+        {
+            WriteBufferFromOwnString wb;
+            message = PreformattedMessage::create(
+                "Part {} has broken projections. They will be ignored. Broken projections info: {}",
+                part_name, getCurrentExceptionMessage(false));
+            LOG_DEBUG(log, message);
+            result.action = ReplicatedCheckResult::DoNothing;
+        }
+        else
+        {
+            tryLogCurrentException(log, __PRETTY_FUNCTION__);
 
-        auto message = PreformattedMessage::create("Part {} looks broken. Removing it and will try to fetch.", part_name);
-        LOG_ERROR(log, message);
+            message = PreformattedMessage::create("Part {} looks broken. 
Removing it and will try to fetch.", part_name); + LOG_ERROR(log, message); + result.action = ReplicatedCheckResult::TryFetchMissing; + } /// Part is broken, let's try to find it and fetch. result.status = {part_name, false, message}; - result.action = ReplicatedCheckResult::TryFetchMissing; return result; } @@ -419,12 +435,12 @@ ReplicatedCheckResult ReplicatedMergeTreePartCheckThread::checkPartImpl(const St } -CheckResult ReplicatedMergeTreePartCheckThread::checkPartAndFix(const String & part_name, std::optional * recheck_after) +CheckResult ReplicatedMergeTreePartCheckThread::checkPartAndFix(const String & part_name, std::optional * recheck_after, bool throw_on_broken_projection) { LOG_INFO(log, "Checking part {}", part_name); ProfileEvents::increment(ProfileEvents::ReplicatedPartChecks); - ReplicatedCheckResult result = checkPartImpl(part_name); + ReplicatedCheckResult result = checkPartImpl(part_name, throw_on_broken_projection); switch (result.action) { case ReplicatedCheckResult::None: UNREACHABLE(); @@ -577,7 +593,7 @@ void ReplicatedMergeTreePartCheckThread::run() } std::optional recheck_after; - checkPartAndFix(selected->name, &recheck_after); + checkPartAndFix(selected->name, &recheck_after, /* throw_on_broken_projection */false); if (need_stop) return; diff --git a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h index f2e26b3d324..9091f698546 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h @@ -65,9 +65,9 @@ public: size_t size() const; /// Check part by name - CheckResult checkPartAndFix(const String & part_name, std::optional * recheck_after = nullptr); + CheckResult checkPartAndFix(const String & part_name, std::optional * recheck_after = nullptr, bool throw_on_broken_projection = true); - ReplicatedCheckResult checkPartImpl(const String & part_name); + ReplicatedCheckResult checkPartImpl(const String & part_name, bool throw_on_broken_projection); std::unique_lock pausePartsCheck(); diff --git a/src/Storages/MergeTree/checkDataPart.cpp b/src/Storages/MergeTree/checkDataPart.cpp index 8ae9b54b6e9..0b545beb116 100644 --- a/src/Storages/MergeTree/checkDataPart.cpp +++ b/src/Storages/MergeTree/checkDataPart.cpp @@ -43,6 +43,7 @@ namespace ErrorCodes extern const int NO_FILE_IN_DATA_PART; extern const int NETWORK_ERROR; extern const int SOCKET_TIMEOUT; + extern const int BROKEN_PROJECTION; } @@ -117,7 +118,9 @@ static IMergeTreeDataPart::Checksums checkDataPart( const NameSet & files_without_checksums, const ReadSettings & read_settings, bool require_checksums, - std::function is_cancelled) + std::function is_cancelled, + bool & is_broken_projection, + bool throw_on_broken_projection) { /** Responsibility: * - read list of columns from columns.txt; @@ -126,6 +129,7 @@ static IMergeTreeDataPart::Checksums checkDataPart( */ CurrentMetrics::Increment metric_increment{CurrentMetrics::ReplicatedChecks}; + Poco::Logger * log = &Poco::Logger::get("checkDataPart"); NamesAndTypesList columns_txt; @@ -275,17 +279,55 @@ static IMergeTreeDataPart::Checksums checkDataPart( } } + std::string broken_projections_message; for (const auto & [name, projection] : data_part->getProjectionParts()) { if (is_cancelled()) return {}; auto projection_file = name + ".proj"; - auto projection_checksums = checkDataPart( - projection, *data_part_storage.getProjection(projection_file), - projection->getColumns(), projection->getType(), - 
projection->getFileNamesWithoutChecksums(), - read_settings, require_checksums, is_cancelled); + if (!throw_on_broken_projection && projection->is_broken) + { + projections_on_disk.erase(projection_file); + checksums_txt.remove(projection_file); + } + + IMergeTreeDataPart::Checksums projection_checksums; + try + { + bool noop; + projection_checksums = checkDataPart( + projection, *data_part_storage.getProjection(projection_file), + projection->getColumns(), projection->getType(), + projection->getFileNamesWithoutChecksums(), + read_settings, require_checksums, is_cancelled, noop, /* throw_on_broken_projection */false); + } + catch (...) + { + if (isRetryableException(std::current_exception())) + throw; + + if (!projection->is_broken) + { + LOG_TEST(log, "Marking projection {} as broken ({})", name, projection_file); + projection->setBrokenReason(getCurrentExceptionMessage(false), getCurrentExceptionCode()); + } + + is_broken_projection = true; + if (throw_on_broken_projection) + { + if (!broken_projections_message.empty()) + broken_projections_message += "\n"; + + broken_projections_message += fmt::format( + "Part {} has a broken projection {} (error: {})", + data_part->name, name, getCurrentExceptionMessage(false)); + continue; + } + + projections_on_disk.erase(projection_file); + checksums_txt.remove(projection_file); + } checksums_data.files[projection_file] = IMergeTreeDataPart::Checksums::Checksum( projection_checksums.getTotalSizeOnDisk(), @@ -294,6 +336,11 @@ static IMergeTreeDataPart::Checksums checkDataPart( projections_on_disk.erase(projection_file); } + if (throw_on_broken_projection && !broken_projections_message.empty()) + { + throw Exception(ErrorCodes::BROKEN_PROJECTION, "{}", broken_projections_message); + } + if (require_checksums && !projections_on_disk.empty()) { throw Exception(ErrorCodes::UNEXPECTED_FILE_IN_DATA_PART, @@ -321,7 +368,9 @@ IMergeTreeDataPart::Checksums checkDataPartInMemory(const DataPartInMemoryPtr & IMergeTreeDataPart::Checksums checkDataPart( MergeTreeData::DataPartPtr data_part, bool require_checksums, - std::function is_cancelled) + bool & is_broken_projection, + std::function is_cancelled, + bool throw_on_broken_projection) { if (auto part_in_memory = asInMemoryPart(data_part)) return checkDataPartInMemory(part_in_memory); @@ -363,7 +412,9 @@ IMergeTreeDataPart::Checksums checkDataPart( data_part->getFileNamesWithoutChecksums(), read_settings, require_checksums, - is_cancelled); + is_cancelled, + is_broken_projection, + throw_on_broken_projection); }; try @@ -377,7 +428,9 @@ IMergeTreeDataPart::Checksums checkDataPart( data_part->getFileNamesWithoutChecksums(), read_settings, require_checksums, - is_cancelled); + is_cancelled, + is_broken_projection, + throw_on_broken_projection); } catch (...) 
{ diff --git a/src/Storages/MergeTree/checkDataPart.h b/src/Storages/MergeTree/checkDataPart.h index d0e48b6f80a..a01978f4efe 100644 --- a/src/Storages/MergeTree/checkDataPart.h +++ b/src/Storages/MergeTree/checkDataPart.h @@ -10,7 +10,9 @@ namespace DB IMergeTreeDataPart::Checksums checkDataPart( MergeTreeData::DataPartPtr data_part, bool require_checksums, - std::function is_cancelled = []{ return false; }); + bool & is_broken_projection, + std::function is_cancelled = []{ return false; }, + bool throw_on_broken_projection = false); bool isNotEnoughMemoryErrorCode(int code); bool isRetryableException(const std::exception_ptr exception_ptr); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index e15b308f084..d94a4ff14c0 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -2315,11 +2315,12 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { /// If the checksums file is not present, calculate the checksums and write them to disk. static constexpr auto checksums_path = "checksums.txt"; + bool noop; if (part->isStoredOnDisk() && !part->getDataPartStorage().exists(checksums_path)) { try { - auto calculated_checksums = checkDataPart(part, false); + auto calculated_checksums = checkDataPart(part, false, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */true); calculated_checksums.checkEqual(part->checksums, true); auto & part_mutable = const_cast(*part); @@ -2340,7 +2341,7 @@ std::optional StorageMergeTree::checkDataNext(DataValidationTasksPt { try { - checkDataPart(part, true); + checkDataPart(part, true, noop, /* is_cancelled */[]{ return false; }, /* throw_on_broken_projection */true); return CheckResult(part->name, true, ""); } catch (...) 
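/// A sketch of the updated checkDataPart() contract declared in the header above;
/// the argument values mirror the StorageMergeTree call sites and the lambda is a
/// no-op cancellation check:
///
///     bool is_broken_projection = false;
///     auto checksums = checkDataPart(
///         part,
///         /* require_checksums */ true,
///         is_broken_projection,                   // out-param: set if any projection is broken
///         /* is_cancelled */ []{ return false; },
///         /* throw_on_broken_projection */ true); // CHECK TABLE then reports BROKEN_PROJECTION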
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 1702b52fa35..ee15a26f244 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -8891,12 +8891,11 @@ IStorage::DataValidationTasksPtr StorageReplicatedMergeTree::getCheckTaskList( std::optional StorageReplicatedMergeTree::checkDataNext(DataValidationTasksPtr & check_task_list) { - if (auto part = assert_cast(check_task_list.get())->next()) { try { - return CheckResult(part_check_thread.checkPartAndFix(part->name)); + return part_check_thread.checkPartAndFix(part->name, /* recheck_after */nullptr, /* throw_on_broken_projection */true); } catch (const Exception & ex) { diff --git a/src/Storages/System/StorageSystemProjectionParts.cpp b/src/Storages/System/StorageSystemProjectionParts.cpp index 016705f4e66..b1494f2ba98 100644 --- a/src/Storages/System/StorageSystemProjectionParts.cpp +++ b/src/Storages/System/StorageSystemProjectionParts.cpp @@ -83,7 +83,11 @@ StorageSystemProjectionParts::StorageSystemProjectionParts(const StorageID & tab {"rows_where_ttl_info.expression", std::make_shared(std::make_shared())}, {"rows_where_ttl_info.min", std::make_shared(std::make_shared())}, - {"rows_where_ttl_info.max", std::make_shared(std::make_shared())} + {"rows_where_ttl_info.max", std::make_shared(std::make_shared())}, + + {"is_broken", std::make_shared()}, + {"exception_code", std::make_shared()}, + {"exception", std::make_shared()}, } ) { @@ -272,12 +276,38 @@ void StorageSystemProjectionParts::processNextStorage( add_ttl_info_map(part->ttl_infos.moves_ttl); if (columns_mask[src_index++]) - columns[res_index++]->insert(queryToString(part->default_codec->getCodecDesc())); + { + if (part->default_codec) + columns[res_index++]->insert(queryToString(part->default_codec->getCodecDesc())); + else + columns[res_index++]->insertDefault(); + } add_ttl_info_map(part->ttl_infos.recompression_ttl); add_ttl_info_map(part->ttl_infos.group_by_ttl); add_ttl_info_map(part->ttl_infos.rows_where_ttl); + { + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->is_broken.load(std::memory_order_relaxed)); + + if (part->is_broken) + { + std::lock_guard lock(part->broken_reason_mutex); + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->exception_code); + if (columns_mask[src_index++]) + columns[res_index++]->insert(part->exception); + } + else + { + if (columns_mask[src_index++]) + columns[res_index++]->insertDefault(); + if (columns_mask[src_index++]) + columns[res_index++]->insertDefault(); + } + } + /// _state column should be the latest. 
/// Do not use part->getState*, it can be changed from different thread if (has_state_column) diff --git a/tests/integration/test_broken_projections/__init__.py b/tests/integration/test_broken_projections/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/integration/test_broken_projections/config.d/backups.xml b/tests/integration/test_broken_projections/config.d/backups.xml new file mode 100644 index 00000000000..4da8edffd67 --- /dev/null +++ b/tests/integration/test_broken_projections/config.d/backups.xml @@ -0,0 +1,13 @@ + + + + + local + /var/lib/clickhouse/disks/backups/ + + + + + backups + + diff --git a/tests/integration/test_broken_projections/test.py b/tests/integration/test_broken_projections/test.py new file mode 100644 index 00000000000..4a4690a5d0a --- /dev/null +++ b/tests/integration/test_broken_projections/test.py @@ -0,0 +1,576 @@ +import time +import pytest +import logging +import string +import random +from helpers.cluster import ClickHouseCluster + +cluster = ClickHouseCluster(__file__) + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + main_configs=["config.d/backups.xml"], + stay_alive=True, + with_zookeeper=True, + ) + + logging.info("Starting cluster...") + cluster.start() + logging.info("Cluster started") + + yield cluster + finally: + cluster.shutdown() + + +def create_table(node, table, replica, data_prefix="", aggressive_merge=True): + if data_prefix == "": + data_prefix = table + + if aggressive_merge: + vertical_merge_algorithm_min_rows_to_activate = 1 + vertical_merge_algorithm_min_columns_to_activate = 1 + max_parts_to_merge_at_once = 3 + else: + vertical_merge_algorithm_min_rows_to_activate = 100000 + vertical_merge_algorithm_min_columns_to_activate = 100 + max_parts_to_merge_at_once = 3 + + node.query( + f""" + DROP TABLE IF EXISTS {table} SYNC; + CREATE TABLE {table} + ( + a String, + b String, + c Int64, + d Int64, + e Int64, + PROJECTION proj1 + ( + SELECT c ORDER BY d + ), + PROJECTION proj2 + ( + SELECT d ORDER BY c + ) + ) + ENGINE = ReplicatedMergeTree('/test_broken_projection_{data_prefix}/data/', '{replica}') ORDER BY a + SETTINGS min_bytes_for_wide_part = 0, + max_parts_to_merge_at_once={max_parts_to_merge_at_once}, + enable_vertical_merge_algorithm=0, + vertical_merge_algorithm_min_rows_to_activate = {vertical_merge_algorithm_min_rows_to_activate}, + vertical_merge_algorithm_min_columns_to_activate = {vertical_merge_algorithm_min_columns_to_activate}, + compress_primary_key=0; + """ + ) + + +def insert(node, table, offset, size): + node.query( + f""" + INSERT INTO {table} + SELECT number, number, number, number, number%2 FROM numbers({offset}, {size}) + SETTINGS insert_keeper_fault_injection_probability=0.0; + """ + ) + + +def get_parts(node, table): + return ( + node.query( + f""" + SELECT name + FROM system.parts + WHERE table='{table}' AND database=currentDatabase() AND active = 1 + ORDER BY name;" + """ + ) + .strip() + .split("\n") + ) + + +def bash(node, command): + node.exec_in_container(["bash", "-c", command], privileged=True, user="root") + + +def break_projection(node, table, part, parent_part, break_type): + part_path = node.query( + f""" + SELECT path + FROM system.projection_parts + WHERE table='{table}' + AND database=currentDatabase() + AND active=1 + AND part_name='{part}' + AND parent_name='{parent_part}' + ORDER BY modification_time DESC + LIMIT 1; + """ + ).strip() + + node.query( + f"select 
throwIf(substring('{part_path}', 1, 1) != '/', 'Path is relative: {part_path}')" + ) + + if break_type == "data": + bash(node, f"rm '{part_path}/d.bin'") + bash(node, f"rm '{part_path}/c.bin'") + elif break_type == "metadata": + bash(node, f"rm '{part_path}/columns.txt'") + elif break_type == "part": + bash(node, f"rm -r '{part_path}'") + + +def break_part(node, table, part): + part_path = node.query( + f""" + SELECT path + FROM system.parts + WHERE table='{table}' + AND database=currentDatabase() + AND active=1 + AND part_name='{part}' + ORDER BY modification_time DESC + LIMIT 1; + """ + ).strip() + + node.query( + f"select throwIf(substring('{part_path}', 1, 1) != '/', 'Path is relative: {part_path}')" + ) + bash(node, f"rm '{part_path}/columns.txt'") + + +def get_broken_projections_info(node, table): + return node.query( + f""" + SELECT parent_name, name, errors.name FROM + ( + SELECT parent_name, name, exception_code + FROM system.projection_parts + WHERE table='{table}' + AND database=currentDatabase() + AND is_broken = 1 + ) AS parts_info + INNER JOIN system.errors AS errors + ON parts_info.exception_code = errors.code + ORDER BY parent_name, name + """ + ).strip() + + +def get_projections_info(node, table): + return node.query( + f""" + SELECT parent_name, name, is_broken + FROM system.projection_parts + WHERE table='{table}' + AND active = 1 + AND database=currentDatabase() + ORDER BY parent_name, name + """ + ).strip() + + +def optimize(node, table, final, no_wait): + query = f"OPTIMIZE TABLE {table}" + if final: + query += " FINAL" + if no_wait: + query += " SETTINGS alter_sync=0" + node.query(query) + + +def reattach(node, table): + node.query( + f""" + DETACH TABLE {table}; + ATTACH TABLE {table}; + """ + ) + + +def materialize_projection(node, table, proj): + node.query( + f"ALTER TABLE {table} MATERIALIZE PROJECTION {proj} SETTINGS mutations_sync=2" + ) + + +def check_table_full(node, table): + return node.query( + f"CHECK TABLE {table} SETTINGS check_query_single_value_result = 0;" + ).strip() + + +def random_str(length=6): + alphabet = string.ascii_lowercase + string.digits + return "".join(random.SystemRandom().choice(alphabet) for _ in range(length)) + + +def check(node, table, check_result, expect_broken_part="", expected_error=""): + if expect_broken_part == "proj1": + assert expected_error in node.query_and_get_error( + f"SELECT c FROM '{table}' WHERE d == 12 ORDER BY c" + ) + else: + query_id = node.query( + f"SELECT queryID() FROM (SELECT c FROM '{table}' WHERE d == 12 ORDER BY c)" + ).strip() + node.query("SYSTEM FLUSH LOGS") + res = node.query( + f""" + SELECT query, splitByChar('.', arrayJoin(projections))[-1] + FROM system.query_log + WHERE query_id='{query_id}' AND type='QueryFinish' + """ + ) + if res == "": + res = node.query( + """ + SELECT query_id, query, splitByChar('.', arrayJoin(projections))[-1] + FROM system.query_log ORDER BY query_start_time_microseconds DESC + """ + ) + print(f"LOG: {res}") + assert False + assert "proj1" in res + + if expect_broken_part == "proj2": + assert expected_error in node.query_and_get_error( + f"SELECT d FROM '{table}' WHERE c == 12 ORDER BY d" + ) + else: + query_id = node.query( + f"SELECT queryID() FROM (SELECT d FROM '{table}' WHERE c == 12 ORDER BY d)" + ).strip() + node.query("SYSTEM FLUSH LOGS") + res = node.query( + f""" + SELECT query, splitByChar('.', arrayJoin(projections))[-1] + FROM system.query_log + WHERE query_id='{query_id}' AND type='QueryFinish' + """ + ) + if res == "": + res = node.query( + """ + 
SELECT query_id, query, splitByChar('.', arrayJoin(projections))[-1]
+                FROM system.query_log ORDER BY query_start_time_microseconds DESC
+                """
+            )
+            print(f"LOG: {res}")
+            assert False
+        assert "proj2" in res
+
+    assert check_result == int(node.query(f"CHECK TABLE {table}"))
+
+
+def test_broken_ignored(cluster):
+    node = cluster.instances["node"]
+
+    table_name = "test1"
+    create_table(node, table_name, 1)
+
+    insert(node, table_name, 0, 5)
+    insert(node, table_name, 5, 5)
+    insert(node, table_name, 10, 5)
+    insert(node, table_name, 15, 5)
+
+    assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts(
+        node, table_name
+    )
+
+    # Break the metadata (columns.txt) file of projection 'proj1'.
+    break_projection(node, table_name, "proj1", "all_2_2_0", "metadata")
+
+    # Run a select query and then a "check table" query.
+    # Select works because it does not read columns.txt.
+    # But the "check table" result is expected to be 0.
+    check(node, table_name, 0)
+
+    # Projection 'proj1' from part all_2_2_0 will now appear in broken parts info
+    # because it was marked broken during the "check table" query.
+    assert "all_2_2_0\tproj1\tFILE_DOESNT_EXIST" in get_broken_projections_info(
+        node, table_name
+    )
+
+    # The "check table" query will also show a list of parts which have broken projections.
+    assert "all_2_2_0" in check_table_full(node, table_name)
+
+    # Break the data file of projection 'proj2' for part all_2_2_0.
+    break_projection(node, table_name, "proj2", "all_2_2_0", "data")
+
+    # It will not yet appear in broken projections info.
+    assert "proj2" not in get_broken_projections_info(node, table_name)
+
+    # Select now fails with the error "File doesn't exist".
+    check(node, table_name, 0, "proj2", "FILE_DOESNT_EXIST")
+
+    # Projection 'proj2' from part all_2_2_0 will now appear in broken parts info.
+    assert "all_2_2_0\tproj2\tNO_FILE_IN_DATA_PART" in get_broken_projections_info(
+        node, table_name
+    )
+
+    # Second select works because the projection is now marked as broken.
+    check(node, table_name, 0)
+
+    # Break the data file of projection 'proj2' for part all_3_3_0.
+    break_projection(node, table_name, "proj2", "all_3_3_0", "data")
+
+    # It will not yet appear in broken projections info.
+    assert "all_3_3_0" not in get_broken_projections_info(node, table_name)
+
+    insert(node, table_name, 20, 5)
+    insert(node, table_name, 25, 5)
+
+    # Part all_3_3_0 has projections 'proj1' and 'proj2', but 'proj2' is broken and the server does NOT know it yet.
+    # Parts all_4_4_0 and all_5_5_0 have both projections intact.
+    # So a merge will be created for the future part all_3_5_1.
+    # During the merge it will fail to read 'proj2' of part all_3_3_0, and 'proj2' will be marked broken.
+    # The merge will be retried, and on the second attempt it will succeed.
+    # The result part all_3_5_1 will have only 1 projection - 'proj1', because
+    # it will skip 'proj2', seeing that one part no longer has it in the set of valid projections.
+ optimize(node, table_name, 0, 1) + time.sleep(5) + + # table_uuid=node.query(f"SELECT uuid FROM system.tables WHERE table='{table_name}' and database=currentDatabase()").strip() + # assert 0 < int( + # node.query( + # f""" + # SYSTEM FLUSH LOGS; + # SELECT count() FROM system.text_log + # WHERE level='Error' + # AND logger_name='MergeTreeBackgroundExecutor' + # AND message like 'Exception while executing background task %{table_uuid}:all_3_5_1%%Cannot open file%proj2.proj/c.bin%' + # """) + # ) + + assert "all_3_3_0" in get_broken_projections_info(node, table_name) + check(node, table_name, 0) + + +def test_materialize_broken_projection(cluster): + node = cluster.instances["node"] + + table_name = "test2" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + break_projection(node, table_name, "proj1", "all_1_1_0", "metadata") + reattach(node, table_name) + + assert "all_1_1_0\tproj1\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + assert "Part all_1_1_0 has a broken projection proj1" in check_table_full( + node, table_name + ) + + break_projection(node, table_name, "proj2", "all_1_1_0", "data") + reattach(node, table_name) + + assert "all_1_1_0\tproj2\tFILE_DOESNT_EXIST" in get_broken_projections_info( + node, table_name + ) + assert "Part all_1_1_0 has a broken projection proj2" in check_table_full( + node, table_name + ) + + materialize_projection(node, table_name, "proj1") + + assert "has a broken projection" not in check_table_full(node, table_name) + + +def test_broken_ignored_replicated(cluster): + node = cluster.instances["node"] + + table_name = "test3" + table_name2 = "test3_replica" + create_table(node, table_name, 1) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + check(node, table_name, 1) + + create_table(node, table_name2, 2, table_name) + check(node, table_name2, 1) + + break_projection(node, table_name, "proj1", "all_0_0_0", "data") + assert "Part all_0_0_0 has a broken projection proj1" in check_table_full( + node, table_name + ) + + break_part(node, table_name, "all_0_0_0") + node.query(f"SYSTEM SYNC REPLICA {table_name}") + assert "has a broken projection" not in check_table_full(node, table_name) + + +def get_random_string(string_length=8): + alphabet = string.ascii_letters + string.digits + return "".join((random.choice(alphabet) for _ in range(string_length))) + + +def test_broken_projections_in_backups_1(cluster): + node = cluster.instances["node"] + + table_name = "test4" + create_table(node, table_name, 1, aggressive_merge=False, data_prefix=table_name) + + node.query("SYSTEM STOP MERGES") + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + check(node, table_name, 1) + + break_projection(node, table_name, "proj1", "all_2_2_0", "data") + check(node, table_name, 0, "proj1", "FILE_DOESNT_EXIST") + + assert "all_2_2_0\tproj1\tNO_FILE_IN_DATA_PART" in get_broken_projections_info( + node, table_name + ) + + backup_name = f"b1-{get_random_string()}" + assert "BACKUP_CREATED" in node.query( + f""" + set 
backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', '{backup_name}'); + """ + ) + + node.query("SYSTEM STOP MERGES") + + check(node, table_name, 1) + assert "" == get_broken_projections_info(node, table_name) + + +def test_broken_projections_in_backups_2(cluster): + node = cluster.instances["node"] + + table_name = "test5" + create_table(node, table_name, 1, aggressive_merge=False, data_prefix=table_name) + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + check(node, table_name, 1) + break_projection(node, table_name, "proj2", "all_2_2_0", "part") + check(node, table_name, 0, "proj2", "ErrnoException") + + assert "all_2_2_0\tproj2\tFILE_DOESNT_EXIST" == get_broken_projections_info( + node, table_name + ) + + assert "FILE_DOESNT_EXIST" in node.query_and_get_error( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', 'b2') + """ + ) + + materialize_projection(node, table_name, "proj2") + check(node, table_name, 1) + + backup_name = f"b3-{get_random_string()}" + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', '{backup_name}'); + """ + ) + check(node, table_name, 1) + + +def test_broken_projections_in_backups_3(cluster): + node = cluster.instances["node"] + + table_name = "test6" + create_table(node, table_name, 1, aggressive_merge=False, data_prefix=table_name) + + node.query("SYSTEM STOP MERGES") + + insert(node, table_name, 0, 5) + insert(node, table_name, 5, 5) + insert(node, table_name, 10, 5) + insert(node, table_name, 15, 5) + + assert ["all_0_0_0", "all_1_1_0", "all_2_2_0", "all_3_3_0"] == get_parts( + node, table_name + ) + + check(node, table_name, 1) + + break_projection(node, table_name, "proj1", "all_1_1_0", "part") + assert "Part all_1_1_0 has a broken projection proj1" in check_table_full( + node, table_name + ) + assert "all_1_1_0\tproj1\tFILE_DOESNT_EXIST" == get_broken_projections_info( + node, table_name + ) + + backup_name = f"b4-{get_random_string()}" + assert "BACKUP_CREATED" in node.query( + f""" + set backup_restore_keeper_fault_injection_probability=0.0; + backup table {table_name} to Disk('backups', '{backup_name}') settings check_projection_parts=false, allow_backup_broken_projections=true; + """ + ) + + assert "RESTORED" in node.query( + f""" + drop table {table_name} sync; + set backup_restore_keeper_fault_injection_probability=0.0; + restore table {table_name} from Disk('backups', '{backup_name}'); + """ + ) + + check(node, table_name, 0) + assert "all_1_1_0\tproj1\tNO_FILE_IN_DATA_PART" == get_broken_projections_info( + node, table_name + ) diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference 
b/tests/queries/0_stateless/02117_show_create_table_system.reference index 7382b24afbc..e60fb844de8 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference @@ -686,6 +686,9 @@ CREATE TABLE system.projection_parts `rows_where_ttl_info.expression` Array(String), `rows_where_ttl_info.min` Array(DateTime), `rows_where_ttl_info.max` Array(DateTime), + `is_broken` UInt8, + `exception_code` Int32, + `exception` String, `bytes` UInt64 ALIAS bytes_on_disk, `marks_size` UInt64 ALIAS marks_bytes, `part_name` String ALIAS name From 524a2ca72decc124ef1e38b79843c2388cceb0bb Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 27 Feb 2024 19:17:34 +0100 Subject: [PATCH 0055/1081] WIP on createForShard --- .../OptimizeShardingKeyRewriteInVisitor.cpp | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp index 3a592c0fe55..8aca28a90ef 100644 --- a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp +++ b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp @@ -5,6 +5,12 @@ #include #include #include +#include "Analyzer/ColumnNode.h" +#include "Analyzer/ConstantNode.h" +#include "Analyzer/FunctionNode.h" +#include "Analyzer/IQueryTreeNode.h" +#include "Analyzer/InDepthQueryTreeVisitor.h" +#include "DataTypes/IDataType.h" namespace { @@ -119,4 +125,42 @@ void OptimizeShardingKeyRewriteInMatcher::visit(ASTFunction & function, Data & d } } + +class OptimizeShardingKeyRewriteIn : InDepthQueryTreeVisitorWithContext +{ +public: + using Base = InDepthQueryTreeVisitorWithContext; + using Base::Base; + + void enterImpl(QueryTreeNodePtr & node) + { + auto * function_node = node->as(); + if (!function_node || function_node->getFunctionName() != "in") + return; + + auto & arguments = function_node->getArguments().getNodes(); + auto * column = arguments[0]->as(); + if (!column) + return; + + if (!data.sharding_key_expr->getRequiredColumnsWithTypes().contains(column->getColumnName())) + return; + + if (auto * constant = arguments[1]->as()) + { + if (isTuple(constant->getResultType())) + { + auto & tuple = constant->getValue().get(); + std::erase_if(tuple, [&](auto & child) + { + return tuple.size() > 1 && !shardContains(child, name, data); + }); + } + } + } + + OptimizeShardingKeyRewriteInMatcher::Data data; +}; + + } From cb8390e9c8672bcdead0108be75021d6c6f21331 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 28 Feb 2024 13:32:43 +0800 Subject: [PATCH 0056/1081] Fix build --- src/Disks/ObjectStorages/ObjectStorageFactory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index f64c42c1403..d0c2c9ac4f4 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -16,10 +16,10 @@ #ifndef CLICKHOUSE_KEEPER_STANDALONE_BUILD #include #include -#include #include #endif #include +#include #include #include From 0de2d766fa971f54eff40641e16ed6857e1ece5f Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 28 Feb 2024 15:30:06 +0100 Subject: [PATCH 0057/1081] WIP on different JSONs on shards --- src/Analyzer/IdentifierNode.cpp | 10 +- src/Analyzer/IdentifierNode.h | 6 ++ src/DataTypes/ObjectUtils.cpp | 75 ++++++++++++- src/DataTypes/ObjectUtils.h | 11 ++ .../ClusterProxy/SelectStreamFactory.cpp | 79 
+++++++++++++- .../ClusterProxy/SelectStreamFactory.h | 26 +++++ .../ClusterProxy/executeQuery.cpp | 101 ++++++++++++------ src/Interpreters/ClusterProxy/executeQuery.h | 2 - .../OptimizeShardingKeyRewriteInVisitor.cpp | 35 ++++-- .../OptimizeShardingKeyRewriteInVisitor.h | 3 + src/Processors/QueryPlan/ReadFromRemote.cpp | 10 +- src/Storages/StorageDistributed.cpp | 32 +++--- 12 files changed, 325 insertions(+), 65 deletions(-) diff --git a/src/Analyzer/IdentifierNode.cpp b/src/Analyzer/IdentifierNode.cpp index 88b3daacb12..7e4d4c02a4c 100644 --- a/src/Analyzer/IdentifierNode.cpp +++ b/src/Analyzer/IdentifierNode.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -56,13 +57,18 @@ void IdentifierNode::updateTreeHashImpl(HashState & state) const QueryTreeNodePtr IdentifierNode::cloneImpl() const { - return std::make_shared(identifier); + auto result = std::make_shared(identifier); + result->use_parts_for_to_ast = use_parts_for_to_ast; + return result; } ASTPtr IdentifierNode::toASTImpl(const ConvertToASTOptions & /* options */) const { auto identifier_parts = identifier.getParts(); - return std::make_shared(std::move(identifier_parts)); + if (use_parts_for_to_ast) + return std::make_shared(std::move(identifier_parts)); + else + return std::make_shared(identifier.getFullName()); } } diff --git a/src/Analyzer/IdentifierNode.h b/src/Analyzer/IdentifierNode.h index 872bb14d512..3bc37b4c69d 100644 --- a/src/Analyzer/IdentifierNode.h +++ b/src/Analyzer/IdentifierNode.h @@ -52,6 +52,11 @@ public: void dumpTreeImpl(WriteBuffer & buffer, FormatState & format_state, size_t indent) const override; + void useFullNameInToAST() + { + use_parts_for_to_ast = false; + } + protected: bool isEqualImpl(const IQueryTreeNode & rhs) const override; @@ -64,6 +69,7 @@ protected: private: Identifier identifier; std::optional table_expression_modifiers; + bool use_parts_for_to_ast = false; static constexpr size_t children_size = 0; }; diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 23d29136c85..01ba50d90f3 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -1,3 +1,4 @@ +#include #include #include #include @@ -20,6 +21,16 @@ #include #include #include +#include "Analyzer/ConstantNode.h" +#include "Analyzer/FunctionNode.h" +#include "Analyzer/IQueryTreeNode.h" +#include "Analyzer/Identifier.h" +#include "Analyzer/IdentifierNode.h" +#include "Analyzer/QueryNode.h" +#include "Analyzer/Utils.h" +#include +#include +#include "Common/logger_useful.h" namespace DB @@ -888,10 +899,10 @@ static void addConstantToWithClause(const ASTPtr & query, const String & column_ /// @expected_columns and @available_columns contain descriptions /// of extended Object columns. -void replaceMissedSubcolumnsByConstants( +NamesAndTypes calculateMissedSubcolumns( const ColumnsDescription & expected_columns, - const ColumnsDescription & available_columns, - ASTPtr query) + const ColumnsDescription & available_columns +) { NamesAndTypes missed_names_types; @@ -928,6 +939,18 @@ void replaceMissedSubcolumnsByConstants( [](const auto & lhs, const auto & rhs) { return lhs.name < rhs.name; }); } + return missed_names_types; +} + +/// @expected_columns and @available_columns contain descriptions +/// of extended Object columns. 
+void replaceMissedSubcolumnsByConstants( + const ColumnsDescription & expected_columns, + const ColumnsDescription & available_columns, + ASTPtr query) +{ + NamesAndTypes missed_names_types = calculateMissedSubcolumns(expected_columns, available_columns); + if (missed_names_types.empty()) return; @@ -940,6 +963,52 @@ void replaceMissedSubcolumnsByConstants( addConstantToWithClause(query, name, type); } +/// @expected_columns and @available_columns contain descriptions +/// of extended Object columns. +void replaceMissedSubcolumnsByConstants( + const ColumnsDescription & expected_columns, + const ColumnsDescription & available_columns, + QueryTreeNodePtr & query, + const ContextPtr & context [[maybe_unused]]) +{ + NamesAndTypes missed_names_types = calculateMissedSubcolumns(expected_columns, available_columns); + + if (missed_names_types.empty()) + return; + + auto * query_node = query->as(); + if (!query_node) + return; + + auto table_expression = extractLeftTableExpression(query_node->getJoinTree()); + + auto & with_nodes = query_node->getWith().getNodes(); + + std::unordered_map column_name_to_node; + for (const auto & [name, type] : missed_names_types) + { + auto constant = std::make_shared(type->getDefault(), type); + constant->setAlias(table_expression->getAlias() + name); + // auto materialize = std::make_shared("materialize"); + + // auto function = FunctionFactory::instance().get("materialize", context); + // materialize->getArguments().getNodes() = { constant }; + // materialize->resolveAsFunction(function->build(materialize->getArgumentColumns())); + // materialize->setAlias(name); + + with_nodes.push_back(constant); + + auto id = std::make_shared(Identifier(table_expression->getAlias() + name)); + id->useFullNameInToAST(); + column_name_to_node[name] = id; + LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "Name {} Expression\n{}", name, column_name_to_node[name]->dumpTree()); + } + + LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "Table expression\n{} ", table_expression->dumpTree()); + replaceColumns(query, table_expression, column_name_to_node); + LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "Result:\n{} ", query->dumpTree()); +} + Field FieldVisitorReplaceScalars::operator()(const Array & x) const { if (num_dimensions_to_keep == 0) diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 2bfcaae09ca..f4a8abe8abf 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -3,6 +3,8 @@ #include #include #include +#include "Analyzer/IQueryTreeNode.h" +#include "Interpreters/Context_fwd.h" #include #include #include @@ -14,6 +16,9 @@ namespace DB struct StorageSnapshot; using StorageSnapshotPtr = std::shared_ptr; +class IQueryTreeNode; +using QueryTreeNodePtr = std::shared_ptr; + /// Returns number of dimensions in Array type. 0 if type is not array. size_t getNumberOfDimensions(const IDataType & type); @@ -97,6 +102,12 @@ void replaceMissedSubcolumnsByConstants( const ColumnsDescription & available_columns, ASTPtr query); +void replaceMissedSubcolumnsByConstants( + const ColumnsDescription & expected_columns, + const ColumnsDescription & available_columns, + QueryTreeNodePtr & query, + const ContextPtr & context); + /// Visitor that keeps @num_dimensions_to_keep dimensions in arrays /// and replaces all scalars or nested arrays to @replacement at that level. 
class FieldVisitorReplaceScalars : public StaticVisitor diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp index f0592735caf..5167ffc0e27 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp @@ -5,6 +5,10 @@ #include #include #include +#include "Analyzer/IQueryTreeNode.h" +#include "Interpreters/InterpreterSelectQueryAnalyzer.h" +#include "Interpreters/SelectQueryOptions.h" +#include "Planner/Utils.h" #include #include #include @@ -124,18 +128,55 @@ void SelectStreamFactory::createForShard( if (it != objects_by_shard.end()) replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, query_ast); + createForShardImpl( + shard_info, + query_ast, + main_table, + table_func_ptr, + std::move(context), + local_plans, + remote_shards, + shard_count, + parallel_replicas_enabled, + std::move(shard_filter_generator)); +} + +void SelectStreamFactory::createForShardImpl( + const Cluster::ShardInfo & shard_info, + const ASTPtr & query_ast, + const StorageID & main_table, + const ASTPtr & table_func_ptr, + ContextPtr context, + std::vector & local_plans, + Shards & remote_shards, + UInt32 shard_count, + bool parallel_replicas_enabled, + AdditionalShardFilterGenerator shard_filter_generator) +{ auto emplace_local_stream = [&]() { + Block shard_header; + if (context->getSettingsRef().allow_experimental_analyzer) + shard_header = InterpreterSelectQueryAnalyzer::getSampleBlock(query_ast, context, SelectQueryOptions(processed_stage).analyze()); + else + shard_header = header; + local_plans.emplace_back(createLocalPlan( - query_ast, header, context, processed_stage, shard_info.shard_num, shard_count)); + query_ast, shard_header, context, processed_stage, shard_info.shard_num, shard_count)); }; auto emplace_remote_stream = [&](bool lazy = false, time_t local_delay = 0) { + Block shard_header; + if (context->getSettingsRef().allow_experimental_analyzer) + shard_header = InterpreterSelectQueryAnalyzer::getSampleBlock(query_ast, context, SelectQueryOptions(processed_stage).analyze()); + else + shard_header = header; + remote_shards.emplace_back(Shard{ .query = query_ast, .main_table = main_table, - .header = header, + .header = shard_header, .shard_info = shard_info, .lazy = lazy, .local_delay = local_delay, @@ -243,6 +284,40 @@ void SelectStreamFactory::createForShard( emplace_remote_stream(); } +void SelectStreamFactory::createForShard( + const Cluster::ShardInfo & shard_info, + const QueryTreeNodePtr & query_tree, + const StorageID & main_table, + const ASTPtr & table_func_ptr, + ContextPtr context, + std::vector & local_plans, + Shards & remote_shards, + UInt32 shard_count, + bool parallel_replicas_enabled, + AdditionalShardFilterGenerator shard_filter_generator) +{ + + auto it = objects_by_shard.find(shard_info.shard_num); + QueryTreeNodePtr modified_query = query_tree; + if (it != objects_by_shard.end()) + replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, modified_query, context); + + auto query_ast = queryNodeToDistributedSelectQuery(modified_query); + + createForShardImpl( + shard_info, + query_ast, + main_table, + table_func_ptr, + std::move(context), + local_plans, + remote_shards, + shard_count, + parallel_replicas_enabled, + std::move(shard_filter_generator)); + +} + } } diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.h b/src/Interpreters/ClusterProxy/SelectStreamFactory.h index 
9993ea7028d..45d6ea14c01 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h @@ -7,6 +7,7 @@ #include #include #include +#include "Analyzer/IQueryTreeNode.h" namespace DB { @@ -83,10 +84,35 @@ public: bool parallel_replicas_enabled, AdditionalShardFilterGenerator shard_filter_generator); + void createForShard( + const Cluster::ShardInfo & shard_info, + const QueryTreeNodePtr & query_tree, + const StorageID & main_table, + const ASTPtr & table_func_ptr, + ContextPtr context, + std::vector & local_plans, + Shards & remote_shards, + UInt32 shard_count, + bool parallel_replicas_enabled, + AdditionalShardFilterGenerator shard_filter_generator); + const Block header; const ColumnsDescriptionByShardNum objects_by_shard; const StorageSnapshotPtr storage_snapshot; QueryProcessingStage::Enum processed_stage; + +private: + void createForShardImpl( + const Cluster::ShardInfo & shard_info, + const ASTPtr & query_ast, + const StorageID & main_table, + const ASTPtr & table_func_ptr, + ContextPtr context, + std::vector & local_plans, + Shards & remote_shards, + UInt32 shard_count, + bool parallel_replicas_enabled, + AdditionalShardFilterGenerator shard_filter_generator); }; } diff --git a/src/Interpreters/ClusterProxy/executeQuery.cpp b/src/Interpreters/ClusterProxy/executeQuery.cpp index 6cdff939af1..07ef7aa6c96 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.cpp +++ b/src/Interpreters/ClusterProxy/executeQuery.cpp @@ -204,12 +204,10 @@ void executeQuery( const ASTPtr & table_func_ptr, SelectStreamFactory & stream_factory, LoggerPtr log, - const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info, const ExpressionActionsPtr & sharding_key_expr, const std::string & sharding_key_column_name, - const ClusterPtr & not_optimized_cluster, const DistributedSettings & distributed_settings, AdditionalShardFilterGenerator shard_filter_generator) { @@ -218,6 +216,8 @@ void executeQuery( if (settings.max_distributed_depth && context->getClientInfo().distributed_depth >= settings.max_distributed_depth) throw Exception(ErrorCodes::TOO_LARGE_DISTRIBUTED_DEPTH, "Maximum distributed depth exceeded"); + const ClusterPtr & not_optimized_cluster = query_info.cluster; + std::vector plans; SelectStreamFactory::Shards remote_shards; @@ -237,40 +237,81 @@ void executeQuery( new_context->increaseDistributedDepth(); const size_t shards = cluster->getShardCount(); - for (size_t i = 0, s = cluster->getShardsInfo().size(); i < s; ++i) + + if (context->getSettingsRef().allow_experimental_analyzer) { - const auto & shard_info = cluster->getShardsInfo()[i]; - - ASTPtr query_ast_for_shard = query_ast->clone(); - if (sharding_key_expr && query_info.optimized_cluster && settings.optimize_skip_unused_shards_rewrite_in && shards > 1) + for (size_t i = 0, s = cluster->getShardsInfo().size(); i < s; ++i) { - OptimizeShardingKeyRewriteInVisitor::Data visitor_data{ - sharding_key_expr, - sharding_key_expr->getSampleBlock().getByPosition(0).type, - sharding_key_column_name, + const auto & shard_info = cluster->getShardsInfo()[i]; + + auto query_for_shard = query_info.query_tree->clone(); + if (sharding_key_expr && query_info.optimized_cluster && settings.optimize_skip_unused_shards_rewrite_in && shards > 1) + { + OptimizeShardingKeyRewriteInVisitor::Data visitor_data{ + sharding_key_expr, + sharding_key_expr->getSampleBlock().getByPosition(0).type, + sharding_key_column_name, + shard_info, + not_optimized_cluster->getSlotToShard(), + }; + 
optimizeShardingKeyRewriteIn(query_for_shard, std::move(visitor_data), new_context); + } + + // decide for each shard if parallel reading from replicas should be enabled + // according to settings and number of replicas declared per shard + const auto & addresses = cluster->getShardsAddresses().at(i); + bool parallel_replicas_enabled = addresses.size() > 1 && context->canUseTaskBasedParallelReplicas(); + + stream_factory.createForShard( shard_info, - not_optimized_cluster->getSlotToShard(), - }; - OptimizeShardingKeyRewriteInVisitor visitor(visitor_data); - visitor.visit(query_ast_for_shard); + query_for_shard, + main_table, + table_func_ptr, + new_context, + plans, + remote_shards, + static_cast(shards), + parallel_replicas_enabled, + shard_filter_generator); } + } + else + { + for (size_t i = 0, s = cluster->getShardsInfo().size(); i < s; ++i) + { + const auto & shard_info = cluster->getShardsInfo()[i]; - // decide for each shard if parallel reading from replicas should be enabled - // according to settings and number of replicas declared per shard - const auto & addresses = cluster->getShardsAddresses().at(i); - bool parallel_replicas_enabled = addresses.size() > 1 && context->canUseTaskBasedParallelReplicas(); + ASTPtr query_ast_for_shard = query_info.query->clone(); + if (sharding_key_expr && query_info.optimized_cluster && settings.optimize_skip_unused_shards_rewrite_in && shards > 1) + { + OptimizeShardingKeyRewriteInVisitor::Data visitor_data{ + sharding_key_expr, + sharding_key_expr->getSampleBlock().getByPosition(0).type, + sharding_key_column_name, + shard_info, + not_optimized_cluster->getSlotToShard(), + }; + OptimizeShardingKeyRewriteInVisitor visitor(visitor_data); + visitor.visit(query_ast_for_shard); + } - stream_factory.createForShard( - shard_info, - query_ast_for_shard, - main_table, - table_func_ptr, - new_context, - plans, - remote_shards, - static_cast(shards), - parallel_replicas_enabled, - shard_filter_generator); + // decide for each shard if parallel reading from replicas should be enabled + // according to settings and number of replicas declared per shard + const auto & addresses = cluster->getShardsAddresses().at(i); + bool parallel_replicas_enabled = addresses.size() > 1 && context->canUseTaskBasedParallelReplicas(); + + stream_factory.createForShard( + shard_info, + query_ast_for_shard, + main_table, + table_func_ptr, + new_context, + plans, + remote_shards, + static_cast(shards), + parallel_replicas_enabled, + shard_filter_generator); + } } if (!remote_shards.empty()) diff --git a/src/Interpreters/ClusterProxy/executeQuery.h b/src/Interpreters/ClusterProxy/executeQuery.h index bbc3c6c9e49..8f6f6300c7b 100644 --- a/src/Interpreters/ClusterProxy/executeQuery.h +++ b/src/Interpreters/ClusterProxy/executeQuery.h @@ -58,12 +58,10 @@ void executeQuery( const ASTPtr & table_func_ptr, SelectStreamFactory & stream_factory, LoggerPtr log, - const ASTPtr & query_ast, ContextPtr context, const SelectQueryInfo & query_info, const ExpressionActionsPtr & sharding_key_expr, const std::string & sharding_key_column_name, - const ClusterPtr & not_optimized_cluster, const DistributedSettings & distributed_settings, AdditionalShardFilterGenerator shard_filter_generator); diff --git a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp index 8aca28a90ef..42c6e63da01 100644 --- a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp +++ b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp @@ -1,3 +1,4 @@ 
+#include #include #include #include @@ -11,6 +12,7 @@ #include "Analyzer/IQueryTreeNode.h" #include "Analyzer/InDepthQueryTreeVisitor.h" #include "DataTypes/IDataType.h" +#include "Interpreters/Context_fwd.h" namespace { @@ -126,11 +128,15 @@ void OptimizeShardingKeyRewriteInMatcher::visit(ASTFunction & function, Data & d } -class OptimizeShardingKeyRewriteIn : InDepthQueryTreeVisitorWithContext +class OptimizeShardingKeyRewriteIn : public InDepthQueryTreeVisitorWithContext { public: using Base = InDepthQueryTreeVisitorWithContext; - using Base::Base; + + OptimizeShardingKeyRewriteIn(OptimizeShardingKeyRewriteInVisitor::Data data_, ContextPtr context) + : Base(std::move(context)) + , data(std::move(data_)) + {} void enterImpl(QueryTreeNodePtr & node) { @@ -143,6 +149,8 @@ public: if (!column) return; + auto name = column->getColumnName(); + if (!data.sharding_key_expr->getRequiredColumnsWithTypes().contains(column->getColumnName())) return; @@ -150,17 +158,30 @@ public: { if (isTuple(constant->getResultType())) { - auto & tuple = constant->getValue().get(); - std::erase_if(tuple, [&](auto & child) + const auto & tuple = constant->getValue().get(); + Tuple new_tuple; + new_tuple.reserve(tuple.size()); + + for (const auto & child : tuple) { - return tuple.size() > 1 && !shardContains(child, name, data); - }); + if (shardContains(child, name, data)) + new_tuple.push_back(child); + } + + if (new_tuple.empty()) + new_tuple.push_back(tuple.back()); + node = std::make_shared(new_tuple); } } } - OptimizeShardingKeyRewriteInMatcher::Data data; + OptimizeShardingKeyRewriteInVisitor::Data data; }; +void optimizeShardingKeyRewriteIn(QueryTreeNodePtr & node, OptimizeShardingKeyRewriteInVisitor::Data data, ContextPtr context) +{ + OptimizeShardingKeyRewriteIn visitor(std::move(data), std::move(context)); + visitor.visit(node); +} } diff --git a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.h b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.h index d546db40df7..d202609160b 100644 --- a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.h +++ b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.h @@ -2,6 +2,7 @@ #include #include +#include "Analyzer/IQueryTreeNode.h" namespace DB { @@ -44,4 +45,6 @@ struct OptimizeShardingKeyRewriteInMatcher using OptimizeShardingKeyRewriteInVisitor = InDepthNodeVisitor; +void optimizeShardingKeyRewriteIn(QueryTreeNodePtr & node, OptimizeShardingKeyRewriteInVisitor::Data data, ContextPtr context); + } diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 93c73a66b78..022c4f699f2 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -216,7 +216,7 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStream }; pipes.emplace_back(createDelayedPipe(shard.header, lazily_create_stream, add_totals, add_extremes)); - addConvertingActions(pipes.back(), output_stream->header); + addConvertingActions(pipes.back(), shard.header); } void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard) @@ -281,7 +281,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact auto remote_query_executor = std::make_shared( shard.shard_info.pool, query_string, - output_stream->header, + shard.header, context, throttler, scalars, @@ -297,7 +297,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact pipes.emplace_back( 
createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending)); - addConvertingActions(pipes.back(), output_stream->header); + addConvertingActions(pipes.back(), shard.header); } } else @@ -305,7 +305,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact const String query_string = formattedAST(shard.query); auto remote_query_executor = std::make_shared( - shard.shard_info.pool, query_string, output_stream->header, context, throttler, scalars, external_tables, stage); + shard.shard_info.pool, query_string, shard.header, context, throttler, scalars, external_tables, stage); remote_query_executor->setLogger(log); if (context->canUseTaskBasedParallelReplicas()) @@ -326,7 +326,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact pipes.emplace_back( createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending)); - addConvertingActions(pipes.back(), output_stream->header); + addConvertingActions(pipes.back(), shard.header); } } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 92e7dcdf4f2..34ab21a4751 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -30,6 +30,7 @@ #include #include #include +#include "Analyzer/IQueryTreeNode.h" #include #include @@ -813,7 +814,8 @@ void StorageDistributed::read( const size_t /*num_streams*/) { Block header; - ASTPtr query_ast; + + SelectQueryInfo modified_query_info = query_info; if (local_context->getSettingsRef().allow_experimental_analyzer) { @@ -821,7 +823,7 @@ void StorageDistributed::read( if (!remote_table_function_ptr) remote_storage_id = StorageID{remote_database, remote_table}; - auto query_tree_distributed = buildQueryTreeDistributed(query_info, + auto query_tree_distributed = buildQueryTreeDistributed(modified_query_info, storage_snapshot, remote_storage_id, remote_table_function_ptr); @@ -831,20 +833,24 @@ void StorageDistributed::read( */ for (auto & column : header) column.column = column.column->convertToFullColumnIfConst(); - query_ast = queryNodeToDistributedSelectQuery(query_tree_distributed); + modified_query_info.query = queryNodeToDistributedSelectQuery(query_tree_distributed); + + modified_query_info.query_tree = std::move(query_tree_distributed); } else { - header = InterpreterSelectQuery(query_info.query, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); - query_ast = query_info.query; + header = InterpreterSelectQuery(modified_query_info.query, local_context, SelectQueryOptions(processed_stage).analyze()).getSampleBlock(); } - const auto & modified_query_ast = ClusterProxy::rewriteSelectQuery( - local_context, query_ast, - remote_database, remote_table, remote_table_function_ptr); + if (!local_context->getSettingsRef().allow_experimental_analyzer) + { + modified_query_info.query = ClusterProxy::rewriteSelectQuery( + local_context, modified_query_info.query, + remote_database, remote_table, remote_table_function_ptr); + } /// Return directly (with correct header) if no shard to query. 
- if (query_info.getCluster()->getShardsInfo().empty()) + if (modified_query_info.getCluster()->getShardsInfo().empty()) { if (local_context->getSettingsRef().allow_experimental_analyzer) return; @@ -872,7 +878,7 @@ void StorageDistributed::read( const auto & settings = local_context->getSettingsRef(); ClusterProxy::AdditionalShardFilterGenerator additional_shard_filter_generator; - if (local_context->canUseParallelReplicasCustomKey(*query_info.getCluster())) + if (local_context->canUseParallelReplicasCustomKey(*modified_query_info.getCluster())) { if (auto custom_key_ast = parseCustomKeyForTable(settings.parallel_replicas_custom_key, *local_context)) { @@ -881,7 +887,7 @@ void StorageDistributed::read( column_description = this->getInMemoryMetadataPtr()->columns, custom_key_type = settings.parallel_replicas_custom_key_filter_type.value, context = local_context, - replica_count = query_info.getCluster()->getShardsInfo().front().per_replica_pools.size()](uint64_t replica_num) -> ASTPtr + replica_count = modified_query_info.getCluster()->getShardsInfo().front().per_replica_pools.size()](uint64_t replica_num) -> ASTPtr { return getCustomKeyFilterForParallelReplica( replica_count, replica_num - 1, my_custom_key_ast, custom_key_type, column_description, context); @@ -897,12 +903,10 @@ void StorageDistributed::read( remote_table_function_ptr, select_stream_factory, log, - modified_query_ast, local_context, - query_info, + modified_query_info, sharding_key_expr, sharding_key_column_name, - query_info.cluster, distributed_settings, additional_shard_filter_generator); From 8bf7c2c5971afc22dda32f9f4ad453ac481f2359 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 28 Feb 2024 15:40:42 +0100 Subject: [PATCH 0058/1081] Use output header --- src/Processors/QueryPlan/ReadFromRemote.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index 022c4f699f2..fde2313bc15 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -216,7 +216,7 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStream }; pipes.emplace_back(createDelayedPipe(shard.header, lazily_create_stream, add_totals, add_extremes)); - addConvertingActions(pipes.back(), shard.header); + addConvertingActions(pipes.back(), output_stream->header); } void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard) @@ -297,7 +297,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact pipes.emplace_back( createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending)); - addConvertingActions(pipes.back(), shard.header); + addConvertingActions(pipes.back(), output_stream->header); } } else @@ -326,7 +326,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact pipes.emplace_back( createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending)); - addConvertingActions(pipes.back(), shard.header); + addConvertingActions(pipes.back(), output_stream->header); } } From d2ea882bd8105f5d2e173a6670bf23b2917b3190 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 28 Feb 2024 21:26:19 +0000 Subject: [PATCH 0059/1081] Fix deadlock in parallel parsing when lots of rows are skipped due to errors --- .../Formats/Impl/ParallelParsingInputFormat.cpp | 4 +++- 
.../03001_parallel_parsing_deadlock.reference | 0 .../0_stateless/03001_parallel_parsing_deadlock.sh | 12 ++++++++++++ 3 files changed, 15 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03001_parallel_parsing_deadlock.reference create mode 100755 tests/queries/0_stateless/03001_parallel_parsing_deadlock.sh diff --git a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp index 8b6969bbfcc..447adb1ed48 100644 --- a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp +++ b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp @@ -224,7 +224,9 @@ Chunk ParallelParsingInputFormat::read() /// skipped all rows. For example, it can happen while using settings /// input_format_allow_errors_num/input_format_allow_errors_ratio /// and this segment contained only rows with errors. - /// Process the next unit. + /// Return this empty unit back to segmentator and process the next unit. + unit->status = READY_TO_INSERT; + segmentator_condvar.notify_all(); ++reader_ticket_number; unit = &processing_units[reader_ticket_number % processing_units.size()]; } diff --git a/tests/queries/0_stateless/03001_parallel_parsing_deadlock.reference b/tests/queries/0_stateless/03001_parallel_parsing_deadlock.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03001_parallel_parsing_deadlock.sh b/tests/queries/0_stateless/03001_parallel_parsing_deadlock.sh new file mode 100755 index 00000000000..1bf21dfc53b --- /dev/null +++ b/tests/queries/0_stateless/03001_parallel_parsing_deadlock.sh @@ -0,0 +1,12 @@ +#!/usr/bin/env bash +# Tags: no-fasttest, no-cpu-aarch64 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.csv +$CLICKHOUSE_LOCAL -q "select number > 1000000 ? 
'error' : toString(number) from numbers(2000000) format CSV" > $DATA_FILE +$CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CSV, 'x UInt64') format Null settings input_format_allow_errors_ratio=1" +rm $DATA_FILE + From 974ba7364f193838f735a9233c6dec4298172542 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 29 Feb 2024 00:55:17 +0100 Subject: [PATCH 0060/1081] better --- src/Disks/ObjectStorages/S3/diskSettings.cpp | 16 +++++++++++++--- src/IO/S3/Client.cpp | 2 ++ src/IO/S3/URI.cpp | 2 +- src/IO/WriteBufferFromS3.h | 2 +- src/Storages/StorageS3.cpp | 4 ++++ 5 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/Disks/ObjectStorages/S3/diskSettings.cpp b/src/Disks/ObjectStorages/S3/diskSettings.cpp index b8688cd3de6..10172805f06 100644 --- a/src/Disks/ObjectStorages/S3/diskSettings.cpp +++ b/src/Disks/ObjectStorages/S3/diskSettings.cpp @@ -1,5 +1,6 @@ #include -#include "IO/S3/Client.h" +#include +#include #if USE_AWS_S3 @@ -10,7 +11,7 @@ #include #include #include -#include "Disks/DiskFactory.h" +#include #include #include @@ -25,6 +26,11 @@ namespace DB { +namespace ErrorCodes +{ +extern const int NO_ELEMENTS_IN_CONFIG; +} + std::unique_ptr getSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { const Settings & settings = context->getSettingsRef(); @@ -47,11 +53,15 @@ std::unique_ptr getClient( const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); const Settings & local_settings = context->getSettingsRef(); - String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); + const String endpoint = context->getMacros()->expand(config.getString(config_prefix + ".endpoint")); S3::URI uri(endpoint); if (!uri.key.ends_with('/')) uri.key.push_back('/'); + if (S3::isS3ExpressEndpoint(endpoint) && !config.has(config_prefix + ".region")) + throw Exception( + ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Region should be explicitly specified for directory buckets ({})", config_prefix); + S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( config.getString(config_prefix + ".region", ""), context->getRemoteHostFilter(), diff --git a/src/IO/S3/Client.cpp b/src/IO/S3/Client.cpp index a75d41df3d1..4f93aba2f84 100644 --- a/src/IO/S3/Client.cpp +++ b/src/IO/S3/Client.cpp @@ -1,4 +1,5 @@ #include +#include #if USE_AWS_S3 @@ -965,6 +966,7 @@ PocoHTTPClientConfiguration ClientFactory::createClientConfiguration( // NOLINT bool isS3ExpressEndpoint(const std::string & endpoint) { + /// On one hand this check isn't 100% reliable, on the other - all it will change is whether we attach checksums to the requests. return endpoint.contains("s3express"); } } diff --git a/src/IO/S3/URI.cpp b/src/IO/S3/URI.cpp index 062d3b80850..027cb624ed5 100644 --- a/src/IO/S3/URI.cpp +++ b/src/IO/S3/URI.cpp @@ -122,7 +122,7 @@ URI::URI(const std::string & uri_) "Object storage system name is unrecognized in virtual hosted style S3 URI: {}", quoteString(name)); - if (name == COS || name == COSN) + if (name == COS) storage_name = COSN; else storage_name = name; diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 148cd27f854..59f4e19e15b 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -100,7 +100,7 @@ private: /// We initiate upload, then upload each part and get ETag as a response, and then finalizeImpl() upload with listing all our parts. 
String multipart_upload_id; std::deque multipart_tags; - std::deque multipart_checksums; + std::deque multipart_checksums; // if enabled bool multipart_upload_finished = false; /// Track that prefinalize() is called only once diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 044a1ca5362..f96ff8b7eb6 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -133,6 +133,7 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; extern const int CANNOT_COMPILE_REGEXP; extern const int FILE_DOESNT_EXIST; + extern const int NO_ELEMENTS_IN_CONFIG; } @@ -1403,6 +1404,9 @@ void StorageS3::Configuration::connect(const ContextPtr & context) const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); const Settings & local_settings = context->getSettingsRef(); + if (S3::isS3ExpressEndpoint(url.endpoint) && auth_settings.region.empty()) + throw Exception(ErrorCodes::NO_ELEMENTS_IN_CONFIG, "Region should be explicitly specified for directory buckets"); + S3::PocoHTTPClientConfiguration client_configuration = S3::ClientFactory::instance().createClientConfiguration( auth_settings.region, context->getRemoteHostFilter(), From 37917a3ed34df22756562a04a90d3c985ca23bd8 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Thu, 29 Feb 2024 01:42:32 +0100 Subject: [PATCH 0061/1081] better --- src/IO/S3/Requests.h | 12 ++++++++++++ src/IO/WriteBufferFromS3.cpp | 10 +++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/IO/S3/Requests.h b/src/IO/S3/Requests.h index 6f82a0f39d3..196f074c9df 100644 --- a/src/IO/S3/Requests.h +++ b/src/IO/S3/Requests.h @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -39,6 +40,17 @@ inline void setPartChecksum(Model::CompletedPart & part, const std::string & che part.SetChecksumCRC32(checksum); } +inline void setRequestChecksum(Model::UploadPartRequest & req, const std::string & checksum) +{ + req.SetChecksumCRC32(checksum); +} + +inline std::string calculateChecksum(Model::UploadPartRequest & req) +{ + chassert(req.GetChecksumAlgorithm() == Aws::S3::Model::ChecksumAlgorithm::CRC32); + return Aws::Utils::HashingUtils::Base64Encode(Aws::Utils::HashingUtils::CalculateCRC32(*(req.GetBody()))); +} + template inline void setChecksumAlgorithm(R & request) { diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index a162992278f..80ca96b0382 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -18,10 +18,6 @@ #include #include -#include -#include -#include - #include @@ -461,9 +457,9 @@ S3::UploadPartRequest WriteBufferFromS3::getUploadRequest(size_t part_number, Pa /// Checksums need to be provided on CompleteMultipartUpload requests, so we calculate then manually and store in multipart_checksums if (client_ptr->isS3ExpressBucket()) { - chassert(req.GetChecksumAlgorithm() == Aws::S3::Model::ChecksumAlgorithm::CRC32); - req.SetChecksumCRC32(Aws::Utils::HashingUtils::Base64Encode(Aws::Utils::HashingUtils::CalculateCRC32(*(req.GetBody())))); - multipart_checksums.push_back(req.GetChecksumCRC32()); + auto checksum = S3::RequestChecksum::calculateChecksum(req); + S3::RequestChecksum::setRequestChecksum(req, checksum); + multipart_checksums.push_back(std::move(checksum)); } return req; From 6fbd298b3d7cc06b1f11727263a25bc613f7c295 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 29 Feb 2024 05:03:09 +0300 Subject: [PATCH 0062/1081] Revert "Revert "Use `MergeTree` as a default table engine"" --- src/Core/Settings.h | 2 +- 
src/Core/SettingsChangesHistory.h | 1 + tests/queries/0_stateless/02184_default_table_engine.sql | 2 ++ 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index ae6ea165cc9..5f52396d3bb 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -566,7 +566,7 @@ class IColumn; M(UInt64, min_free_disk_space_for_temporary_data, 0, "The minimum disk space to keep while writing temporary data used in external sorting and aggregation.", 0) \ \ M(DefaultTableEngine, default_temporary_table_engine, DefaultTableEngine::Memory, "Default table engine used when ENGINE is not set in CREATE TEMPORARY statement.",0) \ - M(DefaultTableEngine, default_table_engine, DefaultTableEngine::None, "Default table engine used when ENGINE is not set in CREATE statement.",0) \ + M(DefaultTableEngine, default_table_engine, DefaultTableEngine::MergeTree, "Default table engine used when ENGINE is not set in CREATE statement.",0) \ M(Bool, show_table_uuid_in_table_create_query_if_not_nil, false, "For tables in databases with Engine=Atomic show UUID of the table in its CREATE query.", 0) \ M(Bool, database_atomic_wait_for_drop_and_detach_synchronously, false, "When executing DROP or DETACH TABLE in Atomic database, wait for table data to be finally dropped or detached.", 0) \ M(Bool, enable_scalar_subquery_optimization, true, "If it is set to true, prevent scalar subqueries from (de)serializing large scalar values and possibly avoid running the same subquery more than once.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index e8d013d13ec..661e7cb80da 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -133,6 +133,7 @@ static std::map sett {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, + {"default_table_engine", "None", "MergeTree", "Set default table engine to MergeTree for better usability"}, {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}, {"update_insert_deduplication_token_in_dependent_materialized_views", false, false, "Allow to update insert deduplication token with table identifier during insert in dependent materialized views"}, {"azure_max_unexpected_write_error_retries", 4, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write"}, diff --git a/tests/queries/0_stateless/02184_default_table_engine.sql b/tests/queries/0_stateless/02184_default_table_engine.sql index a984ec1b6c9..aff30eeea98 100644 --- a/tests/queries/0_stateless/02184_default_table_engine.sql +++ b/tests/queries/0_stateless/02184_default_table_engine.sql @@ -1,3 +1,5 @@ +SET default_table_engine = 'None'; + CREATE TABLE table_02184 (x UInt8); --{serverError 119} SET default_table_engine = 'Log'; CREATE TABLE table_02184 (x UInt8); From 0d4648b535a61561d122c87cf181434215753b35 Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Thu, 29 Feb 2024 10:30:17 +0800 Subject: [PATCH 0063/1081] Fix clang-tidy --- src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp index eba57969580..c0b45e1d46a 100644 --- a/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp +++ b/src/Disks/ObjectStorages/Local/LocalObjectStorage.cpp @@ -32,7 +32,7 @@ LocalObjectStorage::LocalObjectStorage(String key_prefix_) else description = "/"; - fs::create_directories(getCommonKeyPrefix()); + fs::create_directories(key_prefix); } bool LocalObjectStorage::exists(const StoredObject & object) const From f8561b2265b924c64c60bdbc5305785c0f0b6f2e Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Thu, 29 Feb 2024 13:53:27 +0100 Subject: [PATCH 0064/1081] Revert "Revert "Support resource request canceling"" --- docs/en/operations/system-tables/scheduler.md | 4 + src/Common/Scheduler/ISchedulerNode.h | 2 + src/Common/Scheduler/ISchedulerQueue.h | 6 ++ src/Common/Scheduler/Nodes/FairPolicy.h | 99 ++++++++++--------- src/Common/Scheduler/Nodes/FifoQueue.h | 31 ++++-- src/Common/Scheduler/Nodes/PriorityPolicy.h | 38 ++++--- .../tests/gtest_dynamic_resource_manager.cpp | 1 - .../Nodes/tests/gtest_resource_scheduler.cpp | 63 ++++++++++++ src/Common/Scheduler/ResourceGuard.h | 9 +- src/Common/Scheduler/ResourceRequest.cpp | 13 +++ src/Common/Scheduler/ResourceRequest.h | 30 +++--- src/Common/Scheduler/SchedulerRoot.h | 32 +++--- .../System/StorageSystemScheduler.cpp | 4 + 13 files changed, 224 insertions(+), 108 deletions(-) create mode 100644 src/Common/Scheduler/ResourceRequest.cpp diff --git a/docs/en/operations/system-tables/scheduler.md b/docs/en/operations/system-tables/scheduler.md index 953db4c28f2..c4de7f76fdc 100644 --- a/docs/en/operations/system-tables/scheduler.md +++ b/docs/en/operations/system-tables/scheduler.md @@ -26,7 +26,9 @@ priority: 0 is_active: 0 active_children: 0 dequeued_requests: 67 +canceled_requests: 0 dequeued_cost: 4692272 +canceled_cost: 0 busy_periods: 63 vruntime: 938454.1999999989 system_vruntime: ᴺᵁᴸᴸ @@ -54,7 +56,9 @@ Columns: - `is_active` (`UInt8`) - Whether this node is currently active - has resource requests to be dequeued and constraints satisfied. - `active_children` (`UInt64`) - The number of children in active state. - `dequeued_requests` (`UInt64`) - The total number of resource requests dequeued from this node. +- `canceled_requests` (`UInt64`) - The total number of resource requests canceled from this node. - `dequeued_cost` (`UInt64`) - The sum of costs (e.g. size in bytes) of all requests dequeued from this node. +- `canceled_cost` (`UInt64`) - The sum of costs (e.g. size in bytes) of all requests canceled from this node. - `busy_periods` (`UInt64`) - The total number of deactivations of this node. - `vruntime` (`Nullable(Float64)`) - For children of `fair` nodes only. Virtual runtime of a node used by SFQ algorithm to select the next child to process in a max-min fair manner. - `system_vruntime` (`Nullable(Float64)`) - For `fair` nodes only. Virtual runtime showing `vruntime` of the last processed resource request. Used during child activation as the new value of `vruntime`. 
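For a quick look at the new introspection columns, a query like the following can be used — a minimal sketch, assuming a server with at least one scheduler resource configured (the filter on type = 'fifo' narrows the output to queue nodes, which is where requests are enqueued and canceled):

    SELECT resource, path, type,
           dequeued_requests, canceled_requests,
           dequeued_cost, canceled_cost
    FROM system.scheduler
    WHERE type = 'fifo'
    ORDER BY resource, path;

Nonzero canceled_requests and canceled_cost on a queue node confirm that the cancellation path added by this commit has actually been exercised.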
diff --git a/src/Common/Scheduler/ISchedulerNode.h b/src/Common/Scheduler/ISchedulerNode.h index 804026d7bf4..20c1f4332da 100644 --- a/src/Common/Scheduler/ISchedulerNode.h +++ b/src/Common/Scheduler/ISchedulerNode.h @@ -387,7 +387,9 @@ public: /// Introspection std::atomic dequeued_requests{0}; + std::atomic canceled_requests{0}; std::atomic dequeued_cost{0}; + std::atomic canceled_cost{0}; std::atomic busy_periods{0}; }; diff --git a/src/Common/Scheduler/ISchedulerQueue.h b/src/Common/Scheduler/ISchedulerQueue.h index cbe63bd304a..532f4bf6c63 100644 --- a/src/Common/Scheduler/ISchedulerQueue.h +++ b/src/Common/Scheduler/ISchedulerQueue.h @@ -50,6 +50,12 @@ public: /// Should be called outside of scheduling subsystem, implementation must be thread-safe. virtual void enqueueRequest(ResourceRequest * request) = 0; + /// Cancel previously enqueued request. + /// Returns `false` and does nothing given unknown or already executed request. + /// Returns `true` if requests has been found and canceled. + /// Should be called outside of scheduling subsystem, implementation must be thread-safe. + virtual bool cancelRequest(ResourceRequest * request) = 0; + /// For introspection ResourceCost getBudget() const { diff --git a/src/Common/Scheduler/Nodes/FairPolicy.h b/src/Common/Scheduler/Nodes/FairPolicy.h index c0e187e6fa9..ce2bf729a04 100644 --- a/src/Common/Scheduler/Nodes/FairPolicy.h +++ b/src/Common/Scheduler/Nodes/FairPolicy.h @@ -134,56 +134,65 @@ public: std::pair dequeueRequest() override { - if (heap_size == 0) - return {nullptr, false}; - - // Recursively pull request from child - auto [request, child_active] = items.front().child->dequeueRequest(); - assert(request != nullptr); - std::pop_heap(items.begin(), items.begin() + heap_size); - Item & current = items[heap_size - 1]; - - // SFQ fairness invariant: system vruntime equals last served request start-time - assert(current.vruntime >= system_vruntime); - system_vruntime = current.vruntime; - - // By definition vruntime is amount of consumed resource (cost) divided by weight - current.vruntime += double(request->cost) / current.child->info.weight; - max_vruntime = std::max(max_vruntime, current.vruntime); - - if (child_active) // Put active child back in heap after vruntime update + // Cycle is required to do deactivations in the case of canceled requests, when dequeueRequest returns `nullptr` + while (true) { - std::push_heap(items.begin(), items.begin() + heap_size); - } - else // Deactivate child if it is empty, but remember it's vruntime for latter activations - { - heap_size--; + if (heap_size == 0) + return {nullptr, false}; - // Store index of this inactive child in `parent.idx` - // This enables O(1) search of inactive children instead of O(n) - current.child->info.parent.idx = heap_size; - } + // Recursively pull request from child + auto [request, child_active] = items.front().child->dequeueRequest(); + std::pop_heap(items.begin(), items.begin() + heap_size); + Item & current = items[heap_size - 1]; - // Reset any difference between children on busy period end - if (heap_size == 0) - { - // Reset vtime to zero to avoid floating-point error accumulation, - // but do not reset too often, because it's O(N) - UInt64 ns = clock_gettime_ns(); - if (last_reset_ns + 1000000000 < ns) + if (request) { - last_reset_ns = ns; - for (Item & item : items) - item.vruntime = 0; - max_vruntime = 0; - } - system_vruntime = max_vruntime; - busy_periods++; - } + // SFQ fairness invariant: system vruntime equals last served request start-time + 
assert(current.vruntime >= system_vruntime); + system_vruntime = current.vruntime; - dequeued_requests++; - dequeued_cost += request->cost; - return {request, heap_size > 0}; + // By definition vruntime is amount of consumed resource (cost) divided by weight + current.vruntime += double(request->cost) / current.child->info.weight; + max_vruntime = std::max(max_vruntime, current.vruntime); + } + + if (child_active) // Put active child back in heap after vruntime update + { + std::push_heap(items.begin(), items.begin() + heap_size); + } + else // Deactivate child if it is empty, but remember its vruntime for later activations + { + heap_size--; + + // Store index of this inactive child in `parent.idx` + // This enables O(1) search of inactive children instead of O(n) + current.child->info.parent.idx = heap_size; + } + + // Reset any difference between children on busy period end + if (heap_size == 0) + { + // Reset vtime to zero to avoid floating-point error accumulation, + // but do not reset too often, because it's O(N) + UInt64 ns = clock_gettime_ns(); + if (last_reset_ns + 1000000000 < ns) + { + last_reset_ns = ns; + for (Item & item : items) + item.vruntime = 0; + max_vruntime = 0; + } + system_vruntime = max_vruntime; + busy_periods++; + } + + if (request) + { + dequeued_requests++; + dequeued_cost += request->cost; + return {request, heap_size > 0}; + } + } } bool isActive() override diff --git a/src/Common/Scheduler/Nodes/FifoQueue.h b/src/Common/Scheduler/Nodes/FifoQueue.h index 38ae902bc2f..45ed32343ff 100644 --- a/src/Common/Scheduler/Nodes/FifoQueue.h +++ b/src/Common/Scheduler/Nodes/FifoQueue.h @@ -39,8 +39,7 @@ public: void enqueueRequest(ResourceRequest * request) override { - std::unique_lock lock(mutex); - request->enqueue_ns = clock_gettime_ns(); + std::lock_guard lock(mutex); queue_cost += request->cost; bool was_empty = requests.empty(); requests.push_back(request); @@ -50,7 +49,7 @@ public: std::pair<ResourceRequest *, bool> dequeueRequest() override { - std::unique_lock lock(mutex); + std::lock_guard lock(mutex); if (requests.empty()) return {nullptr, false}; ResourceRequest * result = requests.front(); @@ -63,9 +62,29 @@ public: return {result, !requests.empty()}; } + bool cancelRequest(ResourceRequest * request) override + { + std::lock_guard lock(mutex); + // TODO(serxa): reimplement queue as intrusive list of ResourceRequest to make this O(1) instead of O(N) + for (auto i = requests.begin(), e = requests.end(); i != e; ++i) + { + if (*i == request) + { + requests.erase(i); + if (requests.empty()) + busy_periods++; + queue_cost -= request->cost; + canceled_requests++; + canceled_cost += request->cost; + return true; + } + } + return false; + } + bool isActive() override { - std::unique_lock lock(mutex); + std::lock_guard lock(mutex); return !requests.empty(); } @@ -98,14 +117,14 @@ public: std::pair<UInt64, ResourceCost> getQueueLengthAndCost() { - std::unique_lock lock(mutex); + std::lock_guard lock(mutex); return {requests.size(), queue_cost}; } private: std::mutex mutex; Int64 queue_cost = 0; - std::deque<ResourceRequest *> requests; + std::deque<ResourceRequest *> requests; // TODO(serxa): reimplement it using an intrusive list to avoid allocations/deallocations and O(N) during cancel }; } diff --git a/src/Common/Scheduler/Nodes/PriorityPolicy.h b/src/Common/Scheduler/Nodes/PriorityPolicy.h index 6d6b15bd063..9b4cfc37f8c 100644 --- a/src/Common/Scheduler/Nodes/PriorityPolicy.h +++ b/src/Common/Scheduler/Nodes/PriorityPolicy.h @@ -102,25 +102,31 @@ public: std::pair<ResourceRequest *, bool> dequeueRequest() override { - if (items.empty()) - return {nullptr, false}; - - //
Recursively pull request from child - auto [request, child_active] = items.front().child->dequeueRequest(); - assert(request != nullptr); - - // Deactivate child if it is empty - if (!child_active) + // A loop is required to perform deactivations in the case of canceled requests, when dequeueRequest returns `nullptr` + while (true) { - std::pop_heap(items.begin(), items.end()); - items.pop_back(); if (items.empty()) - busy_periods++; - } + return {nullptr, false}; - dequeued_requests++; - dequeued_cost += request->cost; - return {request, !items.empty()}; + // Recursively pull request from child + auto [request, child_active] = items.front().child->dequeueRequest(); + + // Deactivate child if it is empty + if (!child_active) + { + std::pop_heap(items.begin(), items.end()); + items.pop_back(); + if (items.empty()) + busy_periods++; + } + + if (request) + { + dequeued_requests++; + dequeued_cost += request->cost; + return {request, !items.empty()}; + } + } } bool isActive() override diff --git a/src/Common/Scheduler/Nodes/tests/gtest_dynamic_resource_manager.cpp b/src/Common/Scheduler/Nodes/tests/gtest_dynamic_resource_manager.cpp index 961a3b6f713..cdf09776077 100644 --- a/src/Common/Scheduler/Nodes/tests/gtest_dynamic_resource_manager.cpp +++ b/src/Common/Scheduler/Nodes/tests/gtest_dynamic_resource_manager.cpp @@ -38,7 +38,6 @@ TEST(SchedulerDynamicResourceManager, Smoke) { ResourceGuard gA(cA->get("res1"), ResourceGuard::PostponeLocking); gA.lock(); - gA.setFailure(); gA.unlock(); ResourceGuard gB(cB->get("res1")); diff --git a/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp b/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp index 9fefbc02cbd..e76639a4b01 100644 --- a/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp +++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp @@ -4,6 +4,7 @@ #include +#include #include using namespace DB; @@ -73,6 +74,22 @@ struct ResourceHolder } }; +struct MyRequest : public ResourceRequest +{ + std::function<void()> on_execute; + + explicit MyRequest(ResourceCost cost_, std::function<void()> on_execute_) + : ResourceRequest(cost_) + , on_execute(on_execute_) + {} + + void execute() override + { + if (on_execute) + on_execute(); + } +}; + TEST(SchedulerRoot, Smoke) { ResourceTest t; @@ -111,3 +128,49 @@ TEST(SchedulerRoot, Smoke) EXPECT_TRUE(fc2->requests.contains(&rg.request)); } } + +TEST(SchedulerRoot, Cancel) +{ + ResourceTest t; + + ResourceHolder r1(t); + auto * fc1 = r1.add("/", "1"); + r1.add("/prio"); + auto a = r1.addQueue("/prio/A", "1"); + auto b = r1.addQueue("/prio/B", "2"); + r1.registerResource(); + + std::barrier sync(2); + std::thread consumer1([&] + { + std::barrier destruct_sync(2); + MyRequest request(1, [&] + { + sync.arrive_and_wait(); // (A) + EXPECT_TRUE(fc1->requests.contains(&request)); + sync.arrive_and_wait(); // (B) + request.finish(); + destruct_sync.arrive_and_wait(); // (C) + }); + a.queue->enqueueRequest(&request); + destruct_sync.arrive_and_wait(); // (C) + }); + + std::thread consumer2([&] + { + MyRequest request(1, [&] + { + FAIL() << "This request must be canceled, but instead executes"; + }); + sync.arrive_and_wait(); // (A) wait for consumer1's request to be inside execute, so that the constraint is in a violated state and our request will not be executed immediately + b.queue->enqueueRequest(&request); + bool canceled = b.queue->cancelRequest(&request); + EXPECT_TRUE(canceled); + sync.arrive_and_wait(); // (B) release consumer1's request so it can finish + }); + + consumer1.join(); +
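+    // If the cancelRequest() above had returned false, the scheduler would
+    // eventually execute consumer2's request once consumer1 released the
+    // constraint, and its on_execute would FAIL(); getting past these joins
+    // therefore means the cancel path really removed the request from queue B.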
consumer2.join(); + + EXPECT_TRUE(fc1->requests.empty()); +} diff --git a/src/Common/Scheduler/ResourceGuard.h b/src/Common/Scheduler/ResourceGuard.h index dca4041b176..50f665a384b 100644 --- a/src/Common/Scheduler/ResourceGuard.h +++ b/src/Common/Scheduler/ResourceGuard.h @@ -71,8 +71,7 @@ public: // lock(mutex) is not required because `Dequeued` request cannot be used by the scheduler thread chassert(state == Dequeued); state = Finished; - if (constraint) - constraint->finishRequest(this); + ResourceRequest::finish(); } static Request & local() @@ -126,12 +125,6 @@ public: } } - /// Mark request as unsuccessful; by default request is considered to be successful - void setFailure() - { - request.successful = false; - } - ResourceLink link; Request & request; }; diff --git a/src/Common/Scheduler/ResourceRequest.cpp b/src/Common/Scheduler/ResourceRequest.cpp new file mode 100644 index 00000000000..26e8084cdfa --- /dev/null +++ b/src/Common/Scheduler/ResourceRequest.cpp @@ -0,0 +1,13 @@ +#include +#include + +namespace DB +{ + +void ResourceRequest::finish() +{ + if (constraint) + constraint->finishRequest(this); +} + +} diff --git a/src/Common/Scheduler/ResourceRequest.h b/src/Common/Scheduler/ResourceRequest.h index 3d2230746f9..f3153ad382c 100644 --- a/src/Common/Scheduler/ResourceRequest.h +++ b/src/Common/Scheduler/ResourceRequest.h @@ -14,9 +14,6 @@ class ISchedulerConstraint; using ResourceCost = Int64; constexpr ResourceCost ResourceCostMax = std::numeric_limits::max(); -/// Timestamps (nanoseconds since epoch) -using ResourceNs = UInt64; - /* * Request for a resource consumption. The main moving part of the scheduling subsystem. * Resource requests processing workflow: @@ -31,7 +28,7 @@ using ResourceNs = UInt64; * 3) Scheduler calls ISchedulerNode::dequeueRequest() that returns the request. * 4) Callback ResourceRequest::execute() is called to provide access to the resource. * 5) The resource consumption is happening outside of the scheduling subsystem. - * 6) request->constraint->finishRequest() is called when consumption is finished. + * 6) ResourceRequest::finish() is called when consumption is finished. * * Steps (5) and (6) can be omitted if constraint is not used by the resource. * @@ -39,7 +36,10 @@ using ResourceNs = UInt64; * Request ownership is done outside of the scheduling subsystem. * After (6) request can be destructed safely. * - * Request cancelling is not supported yet. + * Request can also be canceled before (3) using ISchedulerQueue::cancelRequest(). + * Returning false means it is too late for request to be canceled. It should be processed in a regular way. + * Returning true means successful cancel and therefore steps (4) and (5) are not going to happen + * and step (6) MUST be omitted. 
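+ *
+ * Illustrative consumer-side pattern (an assumed usage sketch, not code from this patch):
+ *   queue->enqueueRequest(&request);              // step (2)
+ *   if (no_longer_needed && queue->cancelRequest(&request))
+ *       return;                                   // steps (4)-(6) will not happen
+ *   ... wait until execute() has been called ...  // steps (3)-(4)
+ *   ... consume the resource ...                  // step (5)
+ *   request.finish();                             // step (6)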
*/ class ResourceRequest { @@ -48,32 +48,20 @@ public: /// NOTE: If cost is not known in advance, ResourceBudget should be used (note that every ISchedulerQueue has it) ResourceCost cost; - /// Request outcome - /// Should be filled during resource consumption - bool successful; - /// Scheduler node to be notified on consumption finish /// Auto-filled during request enqueue/dequeue ISchedulerConstraint * constraint; - /// Timestamps for introspection - ResourceNs enqueue_ns; - ResourceNs execute_ns; - ResourceNs finish_ns; - explicit ResourceRequest(ResourceCost cost_ = 1) { reset(cost_); } + /// ResourceRequest object may be reused again after reset() void reset(ResourceCost cost_) { cost = cost_; - successful = true; constraint = nullptr; - enqueue_ns = 0; - execute_ns = 0; - finish_ns = 0; } virtual ~ResourceRequest() = default; @@ -83,6 +71,12 @@ public: /// just triggering start of a consumption, not doing the consumption itself /// (e.g. setting an std::promise or creating a job in a thread pool) virtual void execute() = 0; + + /// Stop resource consumption and notify resource scheduler. + /// Should be called when resource consumption is finished by consumer. + /// ResourceRequest should not be destructed or reset before calling to `finish()`. + /// WARNING: this function MUST not be called if request was canceled. + void finish(); }; } diff --git a/src/Common/Scheduler/SchedulerRoot.h b/src/Common/Scheduler/SchedulerRoot.h index 3a23a8df834..ab3f702a422 100644 --- a/src/Common/Scheduler/SchedulerRoot.h +++ b/src/Common/Scheduler/SchedulerRoot.h @@ -145,22 +145,27 @@ public: std::pair dequeueRequest() override { - if (current == nullptr) // No active resources - return {nullptr, false}; + while (true) + { + if (current == nullptr) // No active resources + return {nullptr, false}; - // Dequeue request from current resource - auto [request, resource_active] = current->root->dequeueRequest(); - assert(request != nullptr); + // Dequeue request from current resource + auto [request, resource_active] = current->root->dequeueRequest(); - // Deactivate resource if required - if (!resource_active) - deactivate(current); - else - current = current->next; // Just move round-robin pointer + // Deactivate resource if required + if (!resource_active) + deactivate(current); + else + current = current->next; // Just move round-robin pointer - dequeued_requests++; - dequeued_cost += request->cost; - return {request, current != nullptr}; + if (request == nullptr) // Possible in case of request cancel, just retry + continue; + + dequeued_requests++; + dequeued_cost += request->cost; + return {request, current != nullptr}; + } } bool isActive() override @@ -245,7 +250,6 @@ private: void execute(ResourceRequest * request) { - request->execute_ns = clock_gettime_ns(); request->execute(); } diff --git a/src/Storages/System/StorageSystemScheduler.cpp b/src/Storages/System/StorageSystemScheduler.cpp index ba07d44dbf9..633bac5d285 100644 --- a/src/Storages/System/StorageSystemScheduler.cpp +++ b/src/Storages/System/StorageSystemScheduler.cpp @@ -30,7 +30,9 @@ ColumnsDescription StorageSystemScheduler::getColumnsDescription() {"is_active", std::make_shared(), "Whether this node is currently active - has resource requests to be dequeued and constraints satisfied."}, {"active_children", std::make_shared(), "The number of children in active state."}, {"dequeued_requests", std::make_shared(), "The total number of resource requests dequeued from this node."}, + {"canceled_requests", std::make_shared(), "The total 
number of resource requests canceled from this node."}, {"dequeued_cost", std::make_shared(), "The sum of costs (e.g. size in bytes) of all requests dequeued from this node."}, + {"canceled_cost", std::make_shared(), "The sum of costs (e.g. size in bytes) of all requests canceled from this node."}, {"busy_periods", std::make_shared(), "The total number of deactivations of this node."}, {"vruntime", std::make_shared(std::make_shared()), "For children of `fair` nodes only. Virtual runtime of a node used by SFQ algorithm to select the next child to process in a max-min fair manner."}, @@ -93,7 +95,9 @@ void StorageSystemScheduler::fillData(MutableColumns & res_columns, ContextPtr c res_columns[i++]->insert(node->isActive()); res_columns[i++]->insert(node->activeChildren()); res_columns[i++]->insert(node->dequeued_requests.load()); + res_columns[i++]->insert(node->canceled_requests.load()); res_columns[i++]->insert(node->dequeued_cost.load()); + res_columns[i++]->insert(node->canceled_cost.load()); res_columns[i++]->insert(node->busy_periods.load()); Field vruntime; From 8b1a1d42daa01e946aa8102d683dbab90b447838 Mon Sep 17 00:00:00 2001 From: Aleksei Filatov Date: Thu, 29 Feb 2024 18:07:00 +0300 Subject: [PATCH 0065/1081] Traverse shadow directory for system.remote_data_paths --- src/Core/Settings.h | 1 + src/Disks/IDisk.h | 5 ++++- src/Disks/ObjectStorages/DiskObjectStorage.cpp | 10 ++++++++-- src/Disks/ObjectStorages/DiskObjectStorage.h | 5 ++++- .../System/StorageSystemRemoteDataPaths.cpp | 13 +++++++++++++ 5 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d77b3a45188..7cf068d7f1f 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -843,6 +843,7 @@ class IColumn; M(Bool, optimize_uniq_to_count, true, "Rewrite uniq and its variants(except uniqUpTo) to count if subquery has distinct or group by clause.", 0) \ M(Bool, use_variant_as_common_type, false, "Use Variant as a result type for if/multiIf in case when there is no common type for arguments", 0) \ M(Bool, enable_order_by_all, true, "Enable sorting expression ORDER BY ALL.", 0) \ + M(Bool, traverse_shadow_remote_data_paths, false, "Traverse the shadow directory when querying system.remote_data_paths", 0) \ \ /** Experimental functions */ \ M(Bool, allow_experimental_materialized_postgresql_table, false, "Allows to use the MaterializedPostgreSQL table engine.
Disabled by default, because this feature is experimental", 0) \ diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 3d228850537..62b02938d1a 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -320,7 +320,10 @@ public: {} }; - virtual void getRemotePathsRecursive(const String &, std::vector &) + virtual void getRemotePathsRecursive( + const String &, + std::vector &, + const std::function<bool(const String &)> & /* skip_predicate */ = {}) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method `getRemotePathsRecursive() not implemented for disk: {}`", diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.cpp b/src/Disks/ObjectStorages/DiskObjectStorage.cpp index 2a648f28f14..460d242d5cd 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.cpp +++ b/src/Disks/ObjectStorages/DiskObjectStorage.cpp @@ -90,11 +90,17 @@ StoredObjects DiskObjectStorage::getStorageObjects(const String & local_path) co return metadata_storage->getStorageObjects(local_path); } -void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std::vector & paths_map) +void DiskObjectStorage::getRemotePathsRecursive( + const String & local_path, + std::vector & paths_map, + const std::function<bool(const String &)> & skip_predicate) { if (!metadata_storage->exists(local_path)) return; + if (skip_predicate && skip_predicate(local_path)) + return; + /// Protect against concurrent deletion of files (for example because of a merge). if (metadata_storage->isFile(local_path)) { @@ -142,7 +148,7 @@ void DiskObjectStorage::getRemotePathsRecursive(const String & local_path, std:: } for (; it->isValid(); it->next()) - DiskObjectStorage::getRemotePathsRecursive(fs::path(local_path) / it->name(), paths_map); + DiskObjectStorage::getRemotePathsRecursive(fs::path(local_path) / it->name(), paths_map, skip_predicate); } } diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index e1576509713..d7af656bea3 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -48,7 +48,10 @@ public: StoredObjects getStorageObjects(const String & local_path) const override; - void getRemotePathsRecursive(const String & local_path, std::vector & paths_map) override; + void getRemotePathsRecursive( + const String & local_path, + std::vector & paths_map, + const std::function<bool(const String &)> & skip_predicate = {}) override; const std::string & getCacheName() const override { return object_storage->getCacheName(); } diff --git a/src/Storages/System/StorageSystemRemoteDataPaths.cpp b/src/Storages/System/StorageSystemRemoteDataPaths.cpp index 87b7a84e8ba..708c1369965 100644 --- a/src/Storages/System/StorageSystemRemoteDataPaths.cpp +++ b/src/Storages/System/StorageSystemRemoteDataPaths.cpp @@ -9,6 +9,7 @@ #include #include +namespace fs = std::filesystem; namespace DB { @@ -59,6 +60,18 @@ Pipe StorageSystemRemoteDataPaths::read( std::vector remote_paths_by_local_path; disk->getRemotePathsRecursive("store", remote_paths_by_local_path); disk->getRemotePathsRecursive("data", remote_paths_by_local_path); + if (context->getSettingsRef().traverse_shadow_remote_data_paths) + disk->getRemotePathsRecursive( + "shadow", + remote_paths_by_local_path, + [](const String & local_path) + { + // `shadow/{backup_name}/revision.txt` is not an object metadata file + const auto path = fs::path(local_path); + return path.filename() == "revision.txt" && + path.parent_path().has_parent_path() && + path.parent_path().parent_path().filename() == "shadow"; + }); FileCachePtr cache; From
5641fd8ba9c4f27794367e22632365df5cdf0303 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 29 Feb 2024 16:13:05 +0100 Subject: [PATCH 0066/1081] Fix build after merge --- src/IO/WriteBufferFromS3TaskTracker.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromS3TaskTracker.h b/src/IO/WriteBufferFromS3TaskTracker.h index 134abbbc4c1..4061f084a76 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.h +++ b/src/IO/WriteBufferFromS3TaskTracker.h @@ -1,7 +1,7 @@ #pragma once #include "config.h" -#include +#include #include "WriteBufferFromS3.h" #include From 7632c2c33f357c1c616f734c7bf2502ccbfbd496 Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 29 Feb 2024 15:17:12 +0000 Subject: [PATCH 0067/1081] Remove non-deterministic functions in virtual columns filter --- src/Storages/MergeTree/MergeTreeData.cpp | 2 ++ src/Storages/VirtualColumnUtils.cpp | 21 +++++++++++++++++++ ...with_non_deterministic_functions.reference | 11 ++++++++++ ...lumns_with_non_deterministic_functions.sql | 6 ++++++ 4 files changed, 40 insertions(+) create mode 100644 tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.reference create mode 100644 tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.sql diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 8aa188cfe5c..6494ed5d844 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -1082,6 +1082,8 @@ std::optional MergeTreeData::totalRowsByPartitionPredicateImpl( Block virtual_columns_block = getBlockWithVirtualPartColumns(parts, true /* one_part */); auto filter_dag = VirtualColumnUtils::splitFilterDagForAllowedInputs(filter_actions_dag->getOutputs().at(0), nullptr); + if (!filter_dag) + return {}; // Generate valid expressions for filtering bool valid = true; diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 33ff6e7104f..3e0ef1d7990 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -467,6 +467,23 @@ static bool canEvaluateSubtree(const ActionsDAG::Node * node, const Block & allo return true; } +static bool isDeterministic(const ActionsDAG::Node * node) +{ + if (node->type != ActionsDAG::ActionType::FUNCTION) + return true; + + if (!node->function_base->isDeterministic()) + return false; + + for (const auto * child : node->children) + { + if (!isDeterministic(child)) + return false; + } + + return true; +} + static const ActionsDAG::Node * splitFilterNodeForAllowedInputs( const ActionsDAG::Node * node, const Block * allowed_inputs, @@ -542,6 +559,10 @@ static const ActionsDAG::Node * splitFilterNodeForAllowedInputs( } } } + else if (!isDeterministic(node)) + { + return nullptr; + } } if (allowed_inputs && !canEvaluateSubtree(node, *allowed_inputs)) diff --git a/tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.reference b/tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.reference new file mode 100644 index 00000000000..4c9646d6ffa --- /dev/null +++ b/tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.reference @@ -0,0 +1,11 @@ +0 +0 +0 +0 +0 +0 +0 +0 +0 +0 +1 diff --git a/tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.sql 
b/tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.sql new file mode 100644 index 00000000000..9f8bc6bd3d7 --- /dev/null +++ b/tests/queries/0_stateless/03002_filter_skip_virtual_columns_with_non_deterministic_functions.sql @@ -0,0 +1,6 @@ +create table test (number UInt64) engine=MergeTree order by number; +insert into test select * from numbers(100000000); +select ignore(number) from test where RAND() > 4292390314 limit 10; +select count() > 0 from test where RAND() > 4292390314; +drop table test; + From 09a392772d75b38e1b19ad6bd2a863168ea0de5c Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 29 Feb 2024 15:34:45 +0000 Subject: [PATCH 0068/1081] Use isDeterministicInScopeOfQuery --- src/Storages/VirtualColumnUtils.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 3e0ef1d7990..6d66453442e 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -467,17 +467,17 @@ static bool canEvaluateSubtree(const ActionsDAG::Node * node, const Block & allo return true; } -static bool isDeterministic(const ActionsDAG::Node * node) +static bool isDeterministicInScopeOfQuery(const ActionsDAG::Node * node) { if (node->type != ActionsDAG::ActionType::FUNCTION) return true; - if (!node->function_base->isDeterministic()) + if (!node->function_base->isDeterministicInScopeOfQuery()) return false; for (const auto * child : node->children) { - if (!isDeterministic(child)) + if (!isDeterministicInScopeOfQuery(child)) return false; } @@ -559,7 +559,7 @@ static const ActionsDAG::Node * splitFilterNodeForAllowedInputs( } } } - else if (!isDeterministic(node)) + else if (!isDeterministicInScopeOfQuery(node)) { return nullptr; } From 55053dae4459b1d1a6c05d436d1ab421a96c3934 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 29 Feb 2024 19:18:06 +0100 Subject: [PATCH 0069/1081] Some progress --- src/Analyzer/IdentifierNode.cpp | 10 +--- src/Analyzer/IdentifierNode.h | 6 --- src/DataTypes/ObjectUtils.cpp | 24 +++++----- src/DataTypes/ObjectUtils.h | 4 +- .../ClusterProxy/SelectStreamFactory.cpp | 10 ++-- .../ClusterProxy/SelectStreamFactory.h | 8 +++- src/Processors/QueryPlan/ReadFromRemote.cpp | 47 +++++++++++++++++++ .../test_distributed_type_object/test.py | 2 +- 8 files changed, 80 insertions(+), 31 deletions(-) diff --git a/src/Analyzer/IdentifierNode.cpp b/src/Analyzer/IdentifierNode.cpp index 7e4d4c02a4c..88b3daacb12 100644 --- a/src/Analyzer/IdentifierNode.cpp +++ b/src/Analyzer/IdentifierNode.cpp @@ -1,4 +1,3 @@ -#include #include #include @@ -57,18 +56,13 @@ void IdentifierNode::updateTreeHashImpl(HashState & state) const QueryTreeNodePtr IdentifierNode::cloneImpl() const { - auto result = std::make_shared(identifier); - result->use_parts_for_to_ast = use_parts_for_to_ast; - return result; + return std::make_shared(identifier); } ASTPtr IdentifierNode::toASTImpl(const ConvertToASTOptions & /* options */) const { auto identifier_parts = identifier.getParts(); - if (use_parts_for_to_ast) - return std::make_shared(std::move(identifier_parts)); - else - return std::make_shared(identifier.getFullName()); + return std::make_shared(std::move(identifier_parts)); } } diff --git a/src/Analyzer/IdentifierNode.h b/src/Analyzer/IdentifierNode.h index 3bc37b4c69d..872bb14d512 100644 --- a/src/Analyzer/IdentifierNode.h +++ b/src/Analyzer/IdentifierNode.h @@ -52,11 +52,6 @@ public: void dumpTreeImpl(WriteBuffer & buffer, FormatState & 
format_state, size_t indent) const override; - void useFullNameInToAST() - { - use_parts_for_to_ast = false; - } - protected: bool isEqualImpl(const IQueryTreeNode & rhs) const override; @@ -69,7 +64,6 @@ protected: private: Identifier identifier; std::optional table_expression_modifiers; - bool use_parts_for_to_ast = false; static constexpr size_t children_size = 0; }; diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 01ba50d90f3..47d8c5c9113 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -965,30 +965,32 @@ void replaceMissedSubcolumnsByConstants( /// @expected_columns and @available_columns contain descriptions /// of extended Object columns. -void replaceMissedSubcolumnsByConstants( +MissingObjectList replaceMissedSubcolumnsByConstants( const ColumnsDescription & expected_columns, const ColumnsDescription & available_columns, QueryTreeNodePtr & query, const ContextPtr & context [[maybe_unused]]) { + MissingObjectList missed_list; + NamesAndTypes missed_names_types = calculateMissedSubcolumns(expected_columns, available_columns); if (missed_names_types.empty()) - return; + return missed_list; auto * query_node = query->as(); if (!query_node) - return; + return missed_list; + + missed_list.reserve(missed_names_types.size()); auto table_expression = extractLeftTableExpression(query_node->getJoinTree()); - auto & with_nodes = query_node->getWith().getNodes(); - std::unordered_map column_name_to_node; for (const auto & [name, type] : missed_names_types) { auto constant = std::make_shared(type->getDefault(), type); - constant->setAlias(table_expression->getAlias() + name); + constant->setAlias(table_expression->getAlias() + "." + name); // auto materialize = std::make_shared("materialize"); // auto function = FunctionFactory::instance().get("materialize", context); @@ -996,17 +998,17 @@ void replaceMissedSubcolumnsByConstants( // materialize->resolveAsFunction(function->build(materialize->getArgumentColumns())); // materialize->setAlias(name); - with_nodes.push_back(constant); - - auto id = std::make_shared(Identifier(table_expression->getAlias() + name)); - id->useFullNameInToAST(); - column_name_to_node[name] = id; + column_name_to_node[name] = buildCastFunction(constant, type, context); + missed_list.push_back({ constant->getValueStringRepresentation() + "_" + constant->getResultType()->getName(), table_expression->getAlias() + "." 
+ name }); + LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "{} -> {}", missed_list.back().first, missed_list.back().second); LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "Name {} Expression\n{}", name, column_name_to_node[name]->dumpTree()); } LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "Table expression\n{} ", table_expression->dumpTree()); replaceColumns(query, table_expression, column_name_to_node); LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "Result:\n{} ", query->dumpTree()); + + return missed_list; } Field FieldVisitorReplaceScalars::operator()(const Array & x) const diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index f4a8abe8abf..013e525832e 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -102,7 +102,9 @@ void replaceMissedSubcolumnsByConstants( const ColumnsDescription & available_columns, ASTPtr query); -void replaceMissedSubcolumnsByConstants( +using MissingObjectList = std::vector>; + +MissingObjectList replaceMissedSubcolumnsByConstants( const ColumnsDescription & expected_columns, const ColumnsDescription & available_columns, QueryTreeNodePtr & query, diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp index 5167ffc0e27..5bcd1ce68cb 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp @@ -151,7 +151,8 @@ void SelectStreamFactory::createForShardImpl( Shards & remote_shards, UInt32 shard_count, bool parallel_replicas_enabled, - AdditionalShardFilterGenerator shard_filter_generator) + AdditionalShardFilterGenerator shard_filter_generator, + MissingObjectList missed_list) { auto emplace_local_stream = [&]() { @@ -177,6 +178,7 @@ void SelectStreamFactory::createForShardImpl( .query = query_ast, .main_table = main_table, .header = shard_header, + .missing_object_list = std::move(missed_list), .shard_info = shard_info, .lazy = lazy, .local_delay = local_delay, @@ -299,8 +301,9 @@ void SelectStreamFactory::createForShard( auto it = objects_by_shard.find(shard_info.shard_num); QueryTreeNodePtr modified_query = query_tree; + MissingObjectList missed_list; if (it != objects_by_shard.end()) - replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, modified_query, context); + missed_list = replaceMissedSubcolumnsByConstants(storage_snapshot->object_columns, it->second, modified_query, context); auto query_ast = queryNodeToDistributedSelectQuery(modified_query); @@ -314,7 +317,8 @@ void SelectStreamFactory::createForShard( remote_shards, shard_count, parallel_replicas_enabled, - std::move(shard_filter_generator)); + std::move(shard_filter_generator), + std::move(missed_list)); } diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.h b/src/Interpreters/ClusterProxy/SelectStreamFactory.h index 45d6ea14c01..bee7edb3c19 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include #include @@ -43,6 +44,8 @@ ASTPtr rewriteSelectQuery( using ColumnsDescriptionByShardNum = std::unordered_map; using AdditionalShardFilterGenerator = std::function; +using MissingObjectList = std::vector>; + class SelectStreamFactory { public: @@ -55,6 +58,8 @@ public: StorageID main_table; Block header; + MissingObjectList missing_object_list; + Cluster::ShardInfo shard_info; /// 
If we connect to replicas lazily. @@ -112,7 +117,8 @@ private: Shards & remote_shards, UInt32 shard_count, bool parallel_replicas_enabled, - AdditionalShardFilterGenerator shard_filter_generator); + AdditionalShardFilterGenerator shard_filter_generator, + MissingObjectList missed_list = {}); }; } diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index fde2313bc15..ac507c6d555 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -14,6 +15,7 @@ #include #include #include +#include "DataTypes/ObjectUtils.h" #include #include #include @@ -31,6 +33,48 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } +static void addRenamingActions(Pipe & pipe, const MissingObjectList & missed_list, const Block & output_header) +{ + if (missed_list.empty()) + return; + + const auto & output_columns = output_header.getColumnsWithTypeAndName(); + std::vector indexes; + for (size_t i = 0; i < output_columns.size(); ++i) + { + bool found = false; + for (auto const & elem : missed_list) + { + if (output_columns[i].name.contains(elem.second)) + { + found = true; + break; + } + } + if (found) + indexes.push_back(i); + } + + auto dag = std::make_shared(pipe.getHeader().getColumnsWithTypeAndName()); + + for (size_t index : indexes) + { + dag->addOrReplaceInOutputs(dag->addAlias(*dag->getOutputs()[index], output_header.getByPosition(index).name)); + } + + // dag->addAliases(rename_to_apply); + + auto convert_actions = std::make_shared(dag); + pipe.addSimpleTransform([&](const Block & cur_header, Pipe::StreamType) -> ProcessorPtr + { + return std::make_shared(cur_header, convert_actions); + }); + + LOG_DEBUG(&Poco::Logger::get("addRenamingActions"), "EXPECTED:\n{}", output_header.dumpStructure()); + + LOG_DEBUG(&Poco::Logger::get("addRenamingActions"), "{}", pipe.getHeader().dumpStructure()); +} + static void addConvertingActions(Pipe & pipe, const Block & header) { if (blocksHaveEqualStructure(pipe.getHeader(), header)) @@ -216,6 +260,7 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStream }; pipes.emplace_back(createDelayedPipe(shard.header, lazily_create_stream, add_totals, add_extremes)); + addRenamingActions(pipes.back(), shard.missing_object_list, output_stream->header); addConvertingActions(pipes.back(), output_stream->header); } @@ -297,6 +342,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact pipes.emplace_back( createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending)); + addRenamingActions(pipes.back(), shard.missing_object_list, output_stream->header); addConvertingActions(pipes.back(), output_stream->header); } } @@ -326,6 +372,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact pipes.emplace_back( createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending)); + addRenamingActions(pipes.back(), shard.missing_object_list, output_stream->header); addConvertingActions(pipes.back(), output_stream->header); } } diff --git a/tests/integration/test_distributed_type_object/test.py b/tests/integration/test_distributed_type_object/test.py index b2179af8a3f..f77e0248f02 100644 --- a/tests/integration/test_distributed_type_object/test.py +++ b/tests/integration/test_distributed_type_object/test.py @@ -59,7 +59,7 @@ def 
test_distributed_type_object(started_cluster): ) expected = TSV("120\n") - assert TSV(node1.query("SELECT sum(data.k2 * id) FROM dist_table")) == expected + assert TSV(node1.query("SELECT sum(data.k2 * id) FROM dist_table SETTINGS optimize_arithmetic_operations_in_aggregate_functions = 0")) == expected node1.query("TRUNCATE TABLE local_table") node2.query("TRUNCATE TABLE local_table") From 41deadda359ca02528fa6ffe9ecfed09c36b364a Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 29 Feb 2024 18:37:00 +0000 Subject: [PATCH 0070/1081] Automatic style fix --- tests/integration/test_distributed_type_object/test.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_distributed_type_object/test.py b/tests/integration/test_distributed_type_object/test.py index f77e0248f02..7e6c000cb8e 100644 --- a/tests/integration/test_distributed_type_object/test.py +++ b/tests/integration/test_distributed_type_object/test.py @@ -59,7 +59,14 @@ def test_distributed_type_object(started_cluster): ) expected = TSV("120\n") - assert TSV(node1.query("SELECT sum(data.k2 * id) FROM dist_table SETTINGS optimize_arithmetic_operations_in_aggregate_functions = 0")) == expected + assert ( + TSV( + node1.query( + "SELECT sum(data.k2 * id) FROM dist_table SETTINGS optimize_arithmetic_operations_in_aggregate_functions = 0" + ) + ) + == expected + ) node1.query("TRUNCATE TABLE local_table") node2.query("TRUNCATE TABLE local_table") From a6cebad52bf4f29984db99cd4d4aa1eb41c50895 Mon Sep 17 00:00:00 2001 From: Nataly Merezhuk Date: Thu, 29 Feb 2024 16:32:29 -0500 Subject: [PATCH 0071/1081] Adds note on supported PostgreSQL versions. --- docs/en/engines/table-engines/integrations/postgresql.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/en/engines/table-engines/integrations/postgresql.md b/docs/en/engines/table-engines/integrations/postgresql.md index 131df1a435b..9cc4b11243e 100644 --- a/docs/en/engines/table-engines/integrations/postgresql.md +++ b/docs/en/engines/table-engines/integrations/postgresql.md @@ -8,6 +8,10 @@ sidebar_label: PostgreSQL The PostgreSQL engine allows to perform `SELECT` and `INSERT` queries on data that is stored on a remote PostgreSQL server. +:::note +Currently, only PostgreSQL versions 12 and up are supported. 
+::: + ## Creating a Table {#creating-a-table} ``` sql From 0f2d47e5a444bf78ffef6b2506e50079e6bb55c9 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 1 Mar 2024 10:52:44 +0100 Subject: [PATCH 0072/1081] Renamed WriteBufferFromS3TaskTracker to ThreadPoolTaskTracker --- .../ThreadPoolTaskTracker.cpp} | 2 +- .../ThreadPoolTaskTracker.h} | 6 +++--- src/Disks/IO/WriteBufferFromAzureBlobStorage.h | 2 +- src/IO/WriteBufferFromS3.cpp | 2 +- src/IO/WriteBufferFromS3.h | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) rename src/{IO/WriteBufferFromS3TaskTracker.cpp => Common/ThreadPoolTaskTracker.cpp} (99%) rename src/{IO/WriteBufferFromS3TaskTracker.h => Common/ThreadPoolTaskTracker.h} (94%) diff --git a/src/IO/WriteBufferFromS3TaskTracker.cpp b/src/Common/ThreadPoolTaskTracker.cpp similarity index 99% rename from src/IO/WriteBufferFromS3TaskTracker.cpp rename to src/Common/ThreadPoolTaskTracker.cpp index e62de261fc2..10207eb6296 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.cpp +++ b/src/Common/ThreadPoolTaskTracker.cpp @@ -1,6 +1,6 @@ #include "config.h" -#include +#include "ThreadPoolTaskTracker.h" namespace ProfileEvents { diff --git a/src/IO/WriteBufferFromS3TaskTracker.h b/src/Common/ThreadPoolTaskTracker.h similarity index 94% rename from src/IO/WriteBufferFromS3TaskTracker.h rename to src/Common/ThreadPoolTaskTracker.h index 4061f084a76..d37b759a913 100644 --- a/src/IO/WriteBufferFromS3TaskTracker.h +++ b/src/Common/ThreadPoolTaskTracker.h @@ -1,10 +1,10 @@ #pragma once #include "config.h" -#include -#include "WriteBufferFromS3.h" +#include "threadPoolCallbackRunner.h" +#include "IO/WriteBufferFromS3.h" -#include +#include "logger_useful.h" #include diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h index 2d11014fa2a..4897ca9a846 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h @@ -11,7 +11,7 @@ #include #include #include -#include +#include namespace Poco diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 6fc0a35672f..510d9bef4d3 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -4,8 +4,8 @@ #include "StdIStreamFromMemory.h" #include "WriteBufferFromS3.h" -#include "WriteBufferFromS3TaskTracker.h" +#include #include #include #include diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index 28754d180bf..afd8b9909c1 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include From 6143986b6d79c0262f5f7dc3052ec2a3f4cfc490 Mon Sep 17 00:00:00 2001 From: Aleksei Filatov Date: Fri, 1 Mar 2024 14:55:02 +0300 Subject: [PATCH 0073/1081] Add query test --- ...raverse_shadow_system_data_paths.reference | 3 ++ ...03000_traverse_shadow_system_data_paths.sh | 34 +++++++++++++++++++ 2 files changed, 37 insertions(+) create mode 100644 tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.reference create mode 100755 tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh diff --git a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.reference b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.reference new file mode 100644 index 00000000000..e8183f05f5d --- /dev/null +++ b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.reference @@ -0,0 +1,3 @@ +1 +1 +1 diff --git 
a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh new file mode 100755 index 00000000000..a22cb200f9a --- /dev/null +++ b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh @@ -0,0 +1,34 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +TABLE="03000_traverse_shadow_system_data_path_table" +BACKUP="03000_traverse_shadow_system_data_path_backup" + +${CLICKHOUSE_CLIENT} --query="CREATE TABLE ${TABLE} ( + id Int64, + data String +) ENGINE=MergeTree() +ORDER BY id +SETTINGS storage_policy='s3_cache';" + +${CLICKHOUSE_CLIENT} --query="INSERT INTO ${TABLE} VALUES (0, 'data');" +${CLICKHOUSE_CLIENT} --query "SELECT count() > 0 FROM system.remote_data_paths WHERE disk_name = 's3_cache'" + +${CLICKHOUSE_CLIENT} --query="ALTER TABLE ${TABLE} FREEZE WITH NAME '${BACKUP}';" +${CLICKHOUSE_CLIENT} --query="DROP TABLE ${TABLE} SYNC;" + +${CLICKHOUSE_CLIENT} --query " + SELECT count() > 0 + FROM system.remote_data_paths + WHERE disk_name = 's3_cache' AND local_path LIKE '%shadow/${BACKUP}%' + SETTINGS traverse_shadow_remote_data_paths=1;" +${CLICKHOUSE_CLIENT} --query "SYSTEM UNFREEZE WITH NAME '${BACKUP}';" >/dev/null +${CLICKHOUSE_CLIENT} --query " + SELECT count() == 0 + FROM system.remote_data_paths + WHERE disk_name = 's3_cache' AND local_path LIKE '%shadow/${BACKUP}%' + SETTINGS traverse_shadow_remote_data_paths=1;" From a7aeb4c00f106d396364bf2a21697e329d3d284d Mon Sep 17 00:00:00 2001 From: Peter Date: Fri, 1 Mar 2024 23:44:58 +0800 Subject: [PATCH 0074/1081] Add --now option to enable and start the service --- packages/clickhouse-server.postinstall | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/clickhouse-server.postinstall b/packages/clickhouse-server.postinstall index d3b49db758f..41d4405a790 100644 --- a/packages/clickhouse-server.postinstall +++ b/packages/clickhouse-server.postinstall @@ -36,7 +36,7 @@ if [ "$1" = configure ] || [ -n "$not_deb_os" ]; then fi /bin/systemctl daemon-reload - /bin/systemctl enable clickhouse-server + /bin/systemctl enable --now clickhouse-server else # If you downgrading to version older than 1.1.54336 run: systemctl disable clickhouse-server if [ -x "/etc/init.d/clickhouse-server" ]; then From 3825cb3ad0d7f2296cf075648d022ef26f1e0cef Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy Date: Sat, 2 Mar 2024 15:28:45 +0000 Subject: [PATCH 0075/1081] expand CTE in alter modify query --- src/Interpreters/InterpreterAlterQuery.cpp | 11 +++++++++++ .../0_stateless/03002_modify_query_cte.reference | 2 ++ .../0_stateless/03002_modify_query_cte.sql | 15 +++++++++++++++ 3 files changed, 28 insertions(+) create mode 100644 tests/queries/0_stateless/03002_modify_query_cte.reference create mode 100644 tests/queries/0_stateless/03002_modify_query_cte.sql diff --git a/src/Interpreters/InterpreterAlterQuery.cpp b/src/Interpreters/InterpreterAlterQuery.cpp index b768593da98..7acaf95becc 100644 --- a/src/Interpreters/InterpreterAlterQuery.cpp +++ b/src/Interpreters/InterpreterAlterQuery.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -71,11 +72,15 @@ BlockIO InterpreterAlterQuery::execute() BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) { + ASTSelectWithUnionQuery * modify_query = nullptr; + for (auto & child : alter.command_list->children) { auto * command_ast = child->as(); if 
(command_ast->sql_security) InterpreterCreateQuery::processSQLSecurityOption(getContext(), command_ast->sql_security->as()); + else if (command_ast->type == ASTAlterCommand::MODIFY_QUERY) + modify_query = command_ast->select->as(); } BlockIO res; @@ -123,6 +128,12 @@ BlockIO InterpreterAlterQuery::executeToTable(const ASTAlterQuery & alter) throw Exception(ErrorCodes::TABLE_IS_READ_ONLY, "Table is read-only"); auto table_lock = table->lockForShare(getContext()->getCurrentQueryId(), getContext()->getSettingsRef().lock_acquire_timeout); + if (modify_query) + { + // Expand CTE before filling default database + ApplyWithSubqueryVisitor().visit(*modify_query); + } + /// Add default database to table identifiers that we can encounter in e.g. default expressions, mutation expression, etc. AddDefaultDatabaseVisitor visitor(getContext(), table_id.getDatabaseName()); ASTPtr command_list_ptr = alter.command_list->ptr(); diff --git a/tests/queries/0_stateless/03002_modify_query_cte.reference b/tests/queries/0_stateless/03002_modify_query_cte.reference new file mode 100644 index 00000000000..a3d66f70f8f --- /dev/null +++ b/tests/queries/0_stateless/03002_modify_query_cte.reference @@ -0,0 +1,2 @@ +CREATE MATERIALIZED VIEW default.mv_03002 TO default.table_03002\n(\n `ts` DateTime\n)\nAS SELECT ts\nFROM default.table_03002 +CREATE MATERIALIZED VIEW default.mv_03002 TO default.table_03002\n(\n `ts` DateTime\n)\nAS WITH MY_CTE AS\n (\n SELECT ts\n FROM default.table_03002\n )\nSELECT *\nFROM\nMY_CTE diff --git a/tests/queries/0_stateless/03002_modify_query_cte.sql b/tests/queries/0_stateless/03002_modify_query_cte.sql new file mode 100644 index 00000000000..3a36ce7e7fd --- /dev/null +++ b/tests/queries/0_stateless/03002_modify_query_cte.sql @@ -0,0 +1,15 @@ + +CREATE TABLE table_03002 (ts DateTime, event_type String) ENGINE = MergeTree ORDER BY (event_type, ts); + +CREATE MATERIALIZED VIEW mv_03002 TO table_03002 AS SELECT ts FROM table_03002; + +SHOW CREATE TABLE mv_03002; + +ALTER TABLE mv_03002 MODIFY QUERY +WITH MY_CTE AS (SELECT ts FROM table_03002) +SELECT * FROM MY_CTE; + +SHOW CREATE TABLE mv_03002; + +DROP TABLE mv_03002; +DROP TABLE table_03002; From 17413ded759ebcef809e03a80284f6f805507560 Mon Sep 17 00:00:00 2001 From: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> Date: Sat, 2 Mar 2024 11:11:44 -0500 Subject: [PATCH 0076/1081] Update 03002_modify_query_cte.reference --- tests/queries/0_stateless/03002_modify_query_cte.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03002_modify_query_cte.reference b/tests/queries/0_stateless/03002_modify_query_cte.reference index a3d66f70f8f..50e4a7c6a07 100644 --- a/tests/queries/0_stateless/03002_modify_query_cte.reference +++ b/tests/queries/0_stateless/03002_modify_query_cte.reference @@ -1,2 +1,2 @@ CREATE MATERIALIZED VIEW default.mv_03002 TO default.table_03002\n(\n `ts` DateTime\n)\nAS SELECT ts\nFROM default.table_03002 -CREATE MATERIALIZED VIEW default.mv_03002 TO default.table_03002\n(\n `ts` DateTime\n)\nAS WITH MY_CTE AS\n (\n SELECT ts\n FROM default.table_03002\n )\nSELECT *\nFROM\nMY_CTE +CREATE MATERIALIZED VIEW default.mv_03002 TO default.table_03002\n(\n `ts` DateTime\n)\nAS WITH MY_CTE AS\n (\n SELECT ts\n FROM default.table_03002\n )\nSELECT *\nFROM MY_CTE From a6cb302ab54082db5650263d6417052f81f30710 Mon Sep 17 00:00:00 2001 From: serxa Date: Sun, 3 Mar 2024 15:48:49 +0000 Subject: [PATCH 0077/1081] fix 'AddressSanitizer: stack-use-after-return' --- 
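The one-line fix below hoists `destruct_sync` out of consumer1's lambda. The barrier used to live on consumer1's stack frame while the request callback, which runs on the scheduler's own thread, still arrives at it; the frame can be torn down while the other thread is leaving the wait. A minimal, self-contained sketch of that hazard (an assumed repro shape, not the test itself):

    #include <barrier>
    #include <functional>
    #include <thread>

    std::function<void()> on_execute; // stands in for the scheduler-run callback

    void consumer()
    {
        std::barrier destruct_sync(2);                  // lives on consumer's stack
        on_execute = [&] { destruct_sync.arrive_and_wait(); };
        std::thread scheduler([] { on_execute(); });    // another thread touches it
        destruct_sync.arrive_and_wait();
        scheduler.detach();
    }   // destruct_sync is destroyed here, possibly while `scheduler` is still
        // inside arrive_and_wait(): ASan reports stack-use-after-return. The fix
        // gives the barrier a scope that outlives every thread that uses it.

    int main() { consumer(); }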
src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp b/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp index e76639a4b01..f8196d15819 100644 --- a/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp +++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp @@ -140,10 +140,10 @@ TEST(SchedulerRoot, Cancel) auto b = r1.addQueue("/prio/B", "2"); r1.registerResource(); + std::barrier destruct_sync(2); std::barrier sync(2); std::thread consumer1([&] { - std::barrier destruct_sync(2); MyRequest request(1,[&] { sync.arrive_and_wait(); // (A) From 671b0f678afcdcb354a85aa141920bff09e2bcb2 Mon Sep 17 00:00:00 2001 From: M1eyu2018 <857037797@qq.com> Date: Mon, 4 Mar 2024 10:12:27 +0800 Subject: [PATCH 0078/1081] Add positional read in libhdfs3 Signed-off-by: M1eyu2018 <857037797@qq.com> --- contrib/libhdfs3 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/libhdfs3 b/contrib/libhdfs3 index b9598e60167..0d04201c453 160000 --- a/contrib/libhdfs3 +++ b/contrib/libhdfs3 @@ -1 +1 @@ -Subproject commit b9598e6016720a7c088bfe85ce1fa0410f9d2103 +Subproject commit 0d04201c45359f0d0701fb1e8297d25eff7cfecf From c435d5894f48d37478454b1934d000fb967e2973 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Mon, 4 Mar 2024 14:23:59 +0800 Subject: [PATCH 0079/1081] remove wrong assertion in quantileGK --- .../AggregateFunctionGroupArray.cpp | 13 ++++++++----- .../AggregateFunctionQuantileGK.cpp | 12 ++++-------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp index d72ddb42d9e..6af8b1018dd 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp @@ -182,11 +182,14 @@ public: if constexpr (Trait::sampler == Sampler::NONE) { - if (limit_num_elems && cur_elems.value.size() >= max_elems) + if constexpr (limit_num_elems) { - if constexpr (Trait::last) - cur_elems.value[(cur_elems.total_values - 1) % max_elems] = row_value; - return; + if (cur_elems.value.size() >= max_elems) + { + if constexpr (Trait::last) + cur_elems.value[(cur_elems.total_values - 1) % max_elems] = row_value; + return; + } } cur_elems.value.push_back(row_value, arena); @@ -236,7 +239,7 @@ public: void mergeNoSampler(Data & cur_elems, const Data & rhs_elems, Arena * arena) const { - if (!limit_num_elems) + if constexpr (!limit_num_elems) { if (rhs_elems.value.size()) cur_elems.value.insertByOffsets(rhs_elems.value, 0, rhs_elems.value.size(), arena); diff --git a/src/AggregateFunctions/AggregateFunctionQuantileGK.cpp b/src/AggregateFunctions/AggregateFunctionQuantileGK.cpp index 2e8ccb2e5e4..26737e43eef 100644 --- a/src/AggregateFunctions/AggregateFunctionQuantileGK.cpp +++ b/src/AggregateFunctions/AggregateFunctionQuantileGK.cpp @@ -144,7 +144,7 @@ public: count = other.count; compressed = other.compressed; - sampled.resize(other.sampled.size()); + sampled.resize_exact(other.sampled.size()); memcpy(sampled.data(), other.sampled.data(), sizeof(Stats) * other.sampled.size()); return; } @@ -180,7 +180,7 @@ public: compress(); backup_sampled.clear(); - backup_sampled.reserve(sampled.size() + other.sampled.size()); + backup_sampled.reserve_exact(sampled.size() + other.sampled.size()); double merged_relative_error = std::max(relative_error,
other.relative_error); size_t merged_count = count + other.count; Int64 additional_self_delta = static_cast(std::floor(2 * other.relative_error * other.count)); @@ -268,11 +268,7 @@ public: size_t sampled_len = 0; readBinaryLittleEndian(sampled_len, buf); - if (sampled_len > compress_threshold) - throw Exception( - ErrorCodes::INCORRECT_DATA, "The number of elements {} for quantileGK exceeds {}", sampled_len, compress_threshold); - - sampled.resize(sampled_len); + sampled.resize_exact(sampled_len); for (size_t i = 0; i < sampled_len; ++i) { @@ -317,7 +313,7 @@ private: ::sort(head_sampled.begin(), head_sampled.end()); backup_sampled.clear(); - backup_sampled.reserve(sampled.size() + head_sampled.size()); + backup_sampled.reserve_exact(sampled.size() + head_sampled.size()); size_t sample_idx = 0; size_t ops_idx = 0; From 1768b4477f4ff5db238cd4cc553587b136ed015d Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 3 Mar 2024 11:50:22 +0100 Subject: [PATCH 0080/1081] Revert "Merge pull request #60690 from ClickHouse/remove-bad-test-8" This reverts commit c77eb8b1427f98daf63f7087bbdc0530b07db825, reversing changes made to bae4783fe9bd25decc41383a1234b0e936284c21. --- ..._external_tables_memory_tracking.reference | 16 ++++++ ...52_http_external_tables_memory_tracking.sh | 51 +++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 tests/queries/0_stateless/02152_http_external_tables_memory_tracking.reference create mode 100755 tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh diff --git a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.reference b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.reference new file mode 100644 index 00000000000..1fc09c8d154 --- /dev/null +++ b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.reference @@ -0,0 +1,16 @@ +Checking input_format_parallel_parsing=false& +1 +Checking input_format_parallel_parsing=false&cancel_http_readonly_queries_on_client_close=1&readonly=1 +1 +Checking input_format_parallel_parsing=false&send_progress_in_http_headers=true +1 +Checking input_format_parallel_parsing=false&cancel_http_readonly_queries_on_client_close=1&readonly=1&send_progress_in_http_headers=true +1 +Checking input_format_parallel_parsing=true& +1 +Checking input_format_parallel_parsing=true&cancel_http_readonly_queries_on_client_close=1&readonly=1 +1 +Checking input_format_parallel_parsing=true&send_progress_in_http_headers=true +1 +Checking input_format_parallel_parsing=true&cancel_http_readonly_queries_on_client_close=1&readonly=1&send_progress_in_http_headers=true +1 diff --git a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh new file mode 100755 index 00000000000..5f9eb460e44 --- /dev/null +++ b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh @@ -0,0 +1,51 @@ +#!/usr/bin/env bash +# Tags: no-tsan, no-cpu-aarch64, no-parallel +# TSan does not supports tracing. +# trace_log doesn't work on aarch64 + +# Regression for proper release of Context, +# via tracking memory of external tables. + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +tmp_file=$(mktemp "$CURDIR/clickhouse.XXXXXX.csv") +trap 'rm $tmp_file' EXIT + +$CLICKHOUSE_CLIENT -q "SELECT toString(number) FROM numbers(1e6) FORMAT TSV" > "$tmp_file" + +function run_and_check() +{ + local query_id + query_id="$(${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @- <<<'SELECT generateUUIDv4()')" + + echo "Checking $*" + + # Run query with external table (implicit StorageMemory user) + $CLICKHOUSE_CURL -sS -F "s=@$tmp_file;" "$CLICKHOUSE_URL&s_structure=key+Int&query=SELECT+count()+FROM+s&memory_profiler_sample_probability=1&max_untracked_memory=0&query_id=$query_id&$*" -o /dev/null + + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}" --data-binary @- <<<'SYSTEM FLUSH LOGS' + + # Check that temporary table had been destroyed. + ${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&allow_introspection_functions=1" --data-binary @- <<<" + WITH arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS sym + SELECT count()>0 FROM system.trace_log + WHERE + sym LIKE '%DB::StorageMemory::drop%\n%TemporaryTableHolder::~TemporaryTableHolder%' AND + query_id = '$query_id' + " +} + +for input_format_parallel_parsing in false true; do + query_args_variants=( + "" + "cancel_http_readonly_queries_on_client_close=1&readonly=1" + "send_progress_in_http_headers=true" + # nested progress callback + "cancel_http_readonly_queries_on_client_close=1&readonly=1&send_progress_in_http_headers=true" + ) + for query_args in "${query_args_variants[@]}"; do + run_and_check "input_format_parallel_parsing=$input_format_parallel_parsing&$query_args" + done +done From 048a042dc4963631a23358d3e454dcd8a9eaafa2 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Sun, 3 Mar 2024 11:50:46 +0100 Subject: [PATCH 0081/1081] Make 02152_http_external_tables_memory_tracking less flaky Signed-off-by: Azat Khuzhin --- .../02152_http_external_tables_memory_tracking.sh | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh index 5f9eb460e44..5494f7d59cb 100755 --- a/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh +++ b/tests/queries/0_stateless/02152_http_external_tables_memory_tracking.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-tsan, no-cpu-aarch64, no-parallel +# Tags: no-tsan, no-cpu-aarch64, no-parallel, no-debug # TSan does not supports tracing. # trace_log doesn't work on aarch64 @@ -30,10 +30,16 @@ function run_and_check() # Check that temporary table had been destroyed. 
${CLICKHOUSE_CURL} -sS "${CLICKHOUSE_URL}&allow_introspection_functions=1" --data-binary @- <<<"
     WITH arrayStringConcat(arrayMap(x -> demangle(addressToSymbol(x)), trace), '\n') AS sym
-    SELECT count()>0 FROM system.trace_log
+    SELECT 1 FROM system.trace_log
+    PREWHERE
+        query_id = '$query_id' AND
+        trace_type = 'MemorySample' AND
+        /* only deallocations */
+        size < 0 AND
+        event_date >= yesterday()
     WHERE
-        sym LIKE '%DB::StorageMemory::drop%\n%TemporaryTableHolder::~TemporaryTableHolder%' AND
-        query_id = '$query_id'
+        sym LIKE '%DB::StorageMemory::drop%\n%TemporaryTableHolder::~TemporaryTableHolder%'
+    LIMIT 1
     "
 }

From a7db6688edb50f894457c414b207c25548bb18d3 Mon Sep 17 00:00:00 2001
From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com>
Date: Mon, 4 Mar 2024 18:24:24 +0800
Subject: [PATCH 0082/1081] Update ObjectStorageFactory.cpp

---
 src/Disks/ObjectStorages/ObjectStorageFactory.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp
index 47c02f87b23..a0578ac4454 100644
--- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp
+++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp
@@ -246,12 +246,11 @@ void registerAzureObjectStorage(ObjectStorageFactory & factory)
         bool /* skip_access_check */) -> ObjectStoragePtr
     {
         AzureBlobStorageEndpoint endpoint = processAzureBlobStorageEndpoint(config, config_prefix);
-        return std::make_unique<AzureObjectStorage>(
+        return createObjectStorage<AzureObjectStorage>(
            ObjectStorageType::Azure, config, config_prefix, name,
            getAzureBlobContainerClient(config, config_prefix),
            getAzureBlobStorageSettings(config, config_prefix, context),
            endpoint.prefix.empty() ? endpoint.container_name : endpoint.container_name + "/" + endpoint.prefix);
-    });
 }
 #endif

From 81185815a48b36d344bda623dd175c30e9b87ba3 Mon Sep 17 00:00:00 2001
From: Aleksei Filatov
Date: Mon, 4 Mar 2024 14:09:31 +0300
Subject: [PATCH 0083/1081] Update settings_changes_history

---
 src/Core/SettingsChangesHistory.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h
index 4805df46d9b..b8793f437d8 100644
--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@@ -85,6 +85,9 @@ namespace SettingsChangesHistory
 /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
 static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
 {
+    {"24.3", {
+        {"traverse_shadow_remote_data_paths", false, false, "Traverse shadow directory when query system.remote_data_paths."},
+    }},
     {"24.2", {
        {"output_format_values_escape_quote_with_quote", false, false, "If true escape ' with '', otherwise quoted with \\'"},
        {"input_format_try_infer_exponent_floats", true, false, "Don't infer floats in exponential notation by default"},

From fbdc5e305365e9d93b86ed47144ffb13c1ce70c1 Mon Sep 17 00:00:00 2001
From: Aleksei Filatov
Date: Mon, 4 Mar 2024 17:16:51 +0300
Subject: [PATCH 0084/1081] Ignore flaky failure of system unfreeze

---
 .../0_stateless/03000_traverse_shadow_system_data_paths.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh
index a22cb200f9a..2905d7801ca 100755
--- a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh
+++ b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh
@@ -26,7 +26,7 @@ ${CLICKHOUSE_CLIENT} 
--query " FROM system.remote_data_paths WHERE disk_name = 's3_cache' AND local_path LIKE '%shadow/${BACKUP}%' SETTINGS traverse_shadow_remote_data_paths=1;" -${CLICKHOUSE_CLIENT} --query "SYSTEM UNFREEZE WITH NAME '${BACKUP}';" >/dev/null +${CLICKHOUSE_CLIENT} --query "SYSTEM UNFREEZE WITH NAME '${BACKUP}';" &>/dev/null || true ${CLICKHOUSE_CLIENT} --query " SELECT count() == 0 FROM system.remote_data_paths From aa43885ac81924a73e9a151a550e7c1af43d23e2 Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 5 Mar 2024 10:57:25 +0800 Subject: [PATCH 0085/1081] [improve] add check the remaining disk size before copying --- src/Storages/MergeTree/MergeTreeData.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 849ceb1b66d..d8680958c21 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7160,14 +7160,18 @@ std::pair MergeTreeData::cloneAn { try { + auto reservation_space = src_part_storage->reserve(src_part->getBytesOnDisk()); + if (!reservation_space) { + throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space on disk."); + } dst_part_storage = src_part_storage->clonePart(this->getRelativeDataPath(), tmp_dst_part_name, disk, read_settings, write_settings, {}, {}); copy_successful = true; break; } - catch (...) + catch (Exception & e) { - LOG_TRACE(&Poco::Logger::get("MergeTreeData"), "Clone part on disk {} fail", disk->getName()); + LOG_TRACE(&Poco::Logger::get("MergeTreeData"), "Clone part on disk {} fail: {}", disk->getName(), e.what()); } } if (!copy_successful) @@ -7291,6 +7295,9 @@ std::pair MergeTreeData::cloneAn { try { + auto reservation_space = src_part_storage->reserve(src_part->getBytesOnDisk()); + if (!reservation_space) + throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space on disk."); dst_part_storage = src_part_storage->clonePart(this->getRelativeDataPath(), tmp_dst_part_name, disk, read_settings, write_settings, {}, {}); copy_successful = true; From 758a75c1b46fa27a88e3dcf6e70a18dcf41d62ef Mon Sep 17 00:00:00 2001 From: Aleksei Filatov Date: Tue, 5 Mar 2024 09:53:30 +0300 Subject: [PATCH 0086/1081] Fix flaky test. 
Fix clang-tidy warning --- src/Disks/IDisk.h | 7 +++---- src/Disks/ObjectStorages/DiskObjectStorage.h | 2 +- src/Storages/System/StorageSystemRemoteDataPaths.cpp | 4 ++-- .../0_stateless/03000_traverse_shadow_system_data_paths.sh | 7 +------ 4 files changed, 7 insertions(+), 13 deletions(-) diff --git a/src/Disks/IDisk.h b/src/Disks/IDisk.h index 62b02938d1a..fcc92db7b96 100644 --- a/src/Disks/IDisk.h +++ b/src/Disks/IDisk.h @@ -321,11 +321,10 @@ public: }; virtual void getRemotePathsRecursive( - const String &, - std::vector &, - const std::function & /* skip_predicate */ = {}) + const String &, std::vector &, const std::function & /* skip_predicate */) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, + throw Exception( + ErrorCodes::NOT_IMPLEMENTED, "Method `getRemotePathsRecursive() not implemented for disk: {}`", getDataSourceDescription().toString()); } diff --git a/src/Disks/ObjectStorages/DiskObjectStorage.h b/src/Disks/ObjectStorages/DiskObjectStorage.h index d7af656bea3..9f11c0ed02e 100644 --- a/src/Disks/ObjectStorages/DiskObjectStorage.h +++ b/src/Disks/ObjectStorages/DiskObjectStorage.h @@ -51,7 +51,7 @@ public: void getRemotePathsRecursive( const String & local_path, std::vector & paths_map, - const std::function & skip_predicate = {}) override; + const std::function & skip_predicate) override; const std::string & getCacheName() const override { return object_storage->getCacheName(); } diff --git a/src/Storages/System/StorageSystemRemoteDataPaths.cpp b/src/Storages/System/StorageSystemRemoteDataPaths.cpp index 708c1369965..a6263f18492 100644 --- a/src/Storages/System/StorageSystemRemoteDataPaths.cpp +++ b/src/Storages/System/StorageSystemRemoteDataPaths.cpp @@ -58,8 +58,8 @@ Pipe StorageSystemRemoteDataPaths::read( if (disk->isRemote()) { std::vector remote_paths_by_local_path; - disk->getRemotePathsRecursive("store", remote_paths_by_local_path); - disk->getRemotePathsRecursive("data", remote_paths_by_local_path); + disk->getRemotePathsRecursive("store", remote_paths_by_local_path, /* skip_predicate = */ {}); + disk->getRemotePathsRecursive("data", remote_paths_by_local_path, /* skip_predicate = */ {}); if (context->getSettingsRef().traverse_shadow_remote_data_paths) disk->getRemotePathsRecursive( "shadow", diff --git a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh index 2905d7801ca..a1d4b9bba46 100755 --- a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh +++ b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh @@ -26,9 +26,4 @@ ${CLICKHOUSE_CLIENT} --query " FROM system.remote_data_paths WHERE disk_name = 's3_cache' AND local_path LIKE '%shadow/${BACKUP}%' SETTINGS traverse_shadow_remote_data_paths=1;" -${CLICKHOUSE_CLIENT} --query "SYSTEM UNFREEZE WITH NAME '${BACKUP}';" &>/dev/null || true -${CLICKHOUSE_CLIENT} --query " - SELECT count() == 0 - FROM system.remote_data_paths - WHERE disk_name = 's3_cache' AND local_path LIKE '%shadow/${BACKUP}%' - SETTINGS traverse_shadow_remote_data_paths=1;" +${CLICKHOUSE_CLIENT} --query "SYSTEM UNFREEZE WITH NAME '${BACKUP}';" &>/dev/null From df80c8c9f6ee0939cc6e6e05f3e951511a20f476 Mon Sep 17 00:00:00 2001 From: Aleksei Filatov Date: Tue, 5 Mar 2024 10:43:48 +0300 Subject: [PATCH 0087/1081] Update test reference --- .../03000_traverse_shadow_system_data_paths.reference | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.reference 
b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.reference
index e8183f05f5d..6ed281c757a 100644
--- a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.reference
+++ b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.reference
@@ -1,3 +1,2 @@
 1
 1
-1

From e789d15948eaec3eaa9a8604e24d2f6ed7b60db5 Mon Sep 17 00:00:00 2001
From: taiyang-li <654010905@qq.com>
Date: Tue, 5 Mar 2024 16:06:25 +0800
Subject: [PATCH 0088/1081] optimize insertmanyfrom of nullable(number) or
 nullable(string)

---
 src/Columns/ColumnDecimal.h    |  7 +++++++
 src/Columns/ColumnNullable.cpp |  8 ++++++++
 src/Columns/ColumnNullable.h   |  1 +
 src/Columns/ColumnString.cpp   | 21 +++++++++++++++++++++
 src/Columns/ColumnString.h     |  2 ++
 5 files changed, 39 insertions(+)

diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h
index 7ca01a8342c..e0ea26744dc 100644
--- a/src/Columns/ColumnDecimal.h
+++ b/src/Columns/ColumnDecimal.h
@@ -56,6 +56,13 @@ public:
     void shrinkToFit() override { data.shrink_to_fit(); }
 
     void insertFrom(const IColumn & src, size_t n) override { data.push_back(static_cast<const Self &>(src).getData()[n]); }
+
+    void insertManyFrom(const IColumn & src, size_t position, size_t length) override
+    {
+        ValueType v = assert_cast<const Self &>(src).getData()[position];
+        data.resize_fill(data.size() + length, v);
+    }
+
     void insertData(const char * src, size_t /*length*/) override;
     void insertDefault() override { data.push_back(T()); }
     void insertManyDefaults(size_t length) override { data.resize_fill(data.size() + length); }
diff --git a/src/Columns/ColumnNullable.cpp b/src/Columns/ColumnNullable.cpp
index 1d11827ac97..fa5fdfb8c21 100644
--- a/src/Columns/ColumnNullable.cpp
+++ b/src/Columns/ColumnNullable.cpp
@@ -231,6 +231,14 @@ void ColumnNullable::insertFrom(const IColumn & src, size_t n)
     getNullMapData().push_back(src_concrete.getNullMapData()[n]);
 }
 
+
+void ColumnNullable::insertManyFrom(const IColumn & src, size_t position, size_t length)
+{
+    const ColumnNullable & src_concrete = assert_cast<const ColumnNullable &>(src);
+    getNestedColumn().insertManyFrom(src_concrete.getNestedColumn(), position, length);
+    getNullMapColumn().insertManyFrom(src_concrete.getNullMapColumn(), position, length);
+}
+
 void ColumnNullable::insertFromNotNullable(const IColumn & src, size_t n)
 {
     getNestedColumn().insertFrom(src, n);
diff --git a/src/Columns/ColumnNullable.h b/src/Columns/ColumnNullable.h
index b4aef8e08fa..ef4bf4fa41b 100644
--- a/src/Columns/ColumnNullable.h
+++ b/src/Columns/ColumnNullable.h
@@ -69,6 +69,7 @@ public:
     void insert(const Field & x) override;
     bool tryInsert(const Field & x) override;
     void insertFrom(const IColumn & src, size_t n) override;
+    void insertManyFrom(const IColumn & src, size_t position, size_t length) override;
     void insertFromNotNullable(const IColumn & src, size_t n);
     void insertRangeFromNotNullable(const IColumn & src, size_t start, size_t length);
diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp
index b9128372cea..f3c7ac1bf0c 100644
--- a/src/Columns/ColumnString.cpp
+++ b/src/Columns/ColumnString.cpp
@@ -38,6 +38,27 @@ ColumnString::ColumnString(const ColumnString & src)
             last_offset, chars.size());
 }
 
+void ColumnString::insertManyFrom(const IColumn & src, size_t position, size_t length)
+{
+    const ColumnString & src_concrete = assert_cast<const ColumnString &>(src);
+    const UInt8 * src_buf = &src_concrete.chars[src_concrete.offsets[position - 1]];
+    const size_t src_buf_size
+        = src_concrete.offsets[position] - src_concrete.offsets[position - 1]; /// -1th index is Ok, see 
PaddedPODArray. + + const size_t old_size = chars.size(); + const size_t new_size = old_size + src_buf_size * length; + chars.resize(new_size); + + const size_t old_rows = offsets.size(); + offsets.resize(old_rows + length); + + for (size_t current_offset = old_size; current_offset < new_size; current_offset += src_buf_size) + memcpySmallAllowReadWriteOverflow15(&chars[current_offset], src_buf, src_buf_size); + + for (size_t i = 0, current_offset = old_size + src_buf_size; i < length; ++i, current_offset += src_buf_size) + offsets[old_rows + i] = current_offset; +} + MutableColumnPtr ColumnString::cloneResized(size_t to_size) const { diff --git a/src/Columns/ColumnString.h b/src/Columns/ColumnString.h index 04aa1849187..2d1d69ced73 100644 --- a/src/Columns/ColumnString.h +++ b/src/Columns/ColumnString.h @@ -160,6 +160,8 @@ public: } } + void insertManyFrom(const IColumn & src, size_t position, size_t length) override; + void insertData(const char * pos, size_t length) override { const size_t old_size = chars.size(); From 47ad21dd257ff1a5751d191dfd311a7950a93111 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 5 Mar 2024 12:17:04 +0100 Subject: [PATCH 0089/1081] Remove extra empty line --- .../03002_map_array_functions_with_low_cardinality.sql | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.sql b/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.sql index 8240a8f93f5..8820a433da8 100644 --- a/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.sql +++ b/tests/queries/0_stateless/03002_map_array_functions_with_low_cardinality.sql @@ -1,2 +1 @@ SELECT mapContainsKeyLike(map('aa', toLowCardinality(1), 'bb', toLowCardinality(2)), toLowCardinality('a%')); - From 580fd4ba080df6e29c59b785b1fca0eea76e649c Mon Sep 17 00:00:00 2001 From: Aleksei Filatov Date: Tue, 5 Mar 2024 10:43:48 +0300 Subject: [PATCH 0090/1081] Update test reference --- .../0_stateless/03000_traverse_shadow_system_data_paths.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh index a1d4b9bba46..98575540923 100755 --- a/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh +++ b/tests/queries/0_stateless/03000_traverse_shadow_system_data_paths.sh @@ -26,4 +26,4 @@ ${CLICKHOUSE_CLIENT} --query " FROM system.remote_data_paths WHERE disk_name = 's3_cache' AND local_path LIKE '%shadow/${BACKUP}%' SETTINGS traverse_shadow_remote_data_paths=1;" -${CLICKHOUSE_CLIENT} --query "SYSTEM UNFREEZE WITH NAME '${BACKUP}';" &>/dev/null +${CLICKHOUSE_CLIENT} --query "SYSTEM UNFREEZE WITH NAME '${BACKUP}';" &>/dev/null || true From a109952960acac12790cffde030062ec60208994 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Tue, 5 Mar 2024 22:08:36 +0800 Subject: [PATCH 0091/1081] dev columnstring --- src/Columns/ColumnArray.cpp | 83 +++++++++++++++++++++++++++++++ src/Columns/ColumnArray.h | 9 ++++ src/Columns/ColumnConst.h | 2 + src/Columns/ColumnFixedString.cpp | 14 ++++++ src/Columns/ColumnFixedString.h | 2 + 5 files changed, 110 insertions(+) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 7b268b80116..b620da81ae8 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -347,6 +347,89 @@ void ColumnArray::insertFrom(const IColumn & src_, size_t 
n)
     getOffsets().push_back(getOffsets().back() + size);
 }
 
+template <typename T>
+void ColumnArray::insertManyFromNumber(const ColumnArray & src, size_t position, size_t length)
+{
+    using ColVecType = ColumnVectorOrDecimal<T>;
+    size_t src_size = src.sizeAt(position);
+    size_t src_offset = src.offsetAt(position);
+
+    const typename ColVecType::Container & src_data = typeid_cast<const ColVecType &>(src.getData()).getData();
+    typename ColVecType::Container & data_ref = typeid_cast<ColVecType &>(getData()).getData();
+    size_t old_size = data_ref.size();
+    size_t new_size = old_size + src_size * length;
+    data_ref.resize(new_size);
+    for (size_t i = 0, offset = old_size; i < length; ++i, offset += src_size)
+        memcpy(&data_ref[offset], &src_data[src_offset], src_size * sizeof(T));
+}
+
+void ColumnArray::insertManyFromString(const ColumnArray & src, size_t position, size_t length)
+{
+    size_t src_size = src.sizeAt(position);
+    size_t src_offset = src.offsetAt(position);
+
+    const auto & src_string = typeid_cast<const ColumnString &>(src.getData());
+    const auto & src_chars = src_string.getChars();
+    const auto & src_string_offsets = src_string.getOffsets();
+    auto & dst_string = typeid_cast<ColumnString &>(getData());
+    auto & dst_chars = dst_string.getChars();
+    auto & dst_string_offsets = dst_string.getOffsets();
+
+    /// Each row may have multiple strings, copy them to dst_chars and update dst_offsets
+    size_t old_size = dst_string_offsets.size();
+    size_t new_size = old_size + src_size * length;
+    dst_string_offsets.resize(new_size);
+    size_t dst_string_offset = dst_chars.size();
+    for (size_t i = 0; i < length; ++i)
+    {
+        for (size_t j = 0; j < src_size; ++j)
+        {
+            size_t nested_offset = src_string_offsets[src_offset + j - 1];
+            size_t nested_length = src_string_offsets[src_offset + j] - nested_offset;
+
+            dst_string_offset += nested_length;
+            dst_string_offsets[old_size + i * src_size + j] = dst_string_offset;
+        }
+    }
+
+    size_t chars_to_copy = src_string_offsets[src_offset + src_size - 1] - src_string_offsets[src_offset - 1];
+    dst_chars.resize(dst_chars.size() + chars_to_copy * length);
+    for (size_t dst_offset = old_size; dst_offset < new_size; dst_offset += src_size)
+        memcpy(&dst_chars[dst_string_offsets[dst_offset - 1]], &src_chars[src_string_offsets[src_offset - 1]], chars_to_copy);
+}
+
+void ColumnArray::insertManyFromTuple(const ColumnArray & src, size_t position, size_t length)
+{
+
+}
+void ColumnArray::insertManyFromNullable(const ColumnArray & src, size_t position, size_t length)
+{
+
+}
+void ColumnArray::insertManyFromGeneric(const ColumnArray & src, size_t position, size_t length)
+{
+    size_t src_size = src.sizeAt(position);
+    size_t src_offset = src.offsetAt(position);
+    const auto & src_data = src.getData();
+    size_t new_size = data->size() + src_size * length;
+    data->reserve(new_size);
+    for (size_t i = 0; i < length; ++i)
+        data->insertRangeFrom(src_data, src_offset, src_size);
+}
+
+void ColumnArray::insertManyFrom(const IColumn & src_, size_t position, size_t length)
+{
+    /// First fill offsets
+    const ColumnArray & src = assert_cast<const ColumnArray &>(src_);
+    size_t src_size = src.sizeAt(position);
+    auto & offsets_ref = getOffsets();
+    size_t old_rows = offsets_ref.size();
+    size_t new_rows = old_rows + length;
+    size_t old_size = offsets_ref.back();
+    offsets_ref.resize(new_rows);
+    for (size_t i = 0, offset = old_size + src_size; i < length; ++i, offset += src_size)
+        offsets_ref[old_rows + i] = offset;
+}
 
 void ColumnArray::insertDefault()
 {
diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h
index 230d8830265..73d632a38b9 100644
--- 
a/src/Columns/ColumnArray.h
+++ b/src/Columns/ColumnArray.h
@@ -88,6 +88,7 @@ public:
     void insert(const Field & x) override;
     bool tryInsert(const Field & x) override;
     void insertFrom(const IColumn & src_, size_t n) override;
+    void insertManyFrom(const IColumn & src, size_t position, size_t length) override;
     void insertDefault() override;
     void popBack(size_t n) override;
     ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
@@ -213,6 +214,14 @@ private:
     ColumnPtr filterNullable(const Filter & filt, ssize_t result_size_hint) const;
     ColumnPtr filterGeneric(const Filter & filt, ssize_t result_size_hint) const;
 
+    /// Specializations for insertManyFrom
+    template <typename T>
+    void insertManyFromNumber(const ColumnArray & src, size_t position, size_t length);
+    void insertManyFromString(const ColumnArray & src, size_t position, size_t length);
+    void insertManyFromTuple(const ColumnArray & src, size_t position, size_t length);
+    void insertManyFromNullable(const ColumnArray & src, size_t position, size_t length);
+    void insertManyFromGeneric(const ColumnArray & src, size_t position, size_t length);
+
     int compareAtImpl(size_t n, size_t m, const IColumn & rhs_, int nan_direction_hint, const Collator * collator=nullptr) const;
 };
 
diff --git a/src/Columns/ColumnConst.h b/src/Columns/ColumnConst.h
index 990b7189fa3..4a3d40ca0d2 100644
--- a/src/Columns/ColumnConst.h
+++ b/src/Columns/ColumnConst.h
@@ -150,6 +150,8 @@ public:
         ++s;
     }
 
+    void insertManyFrom(const IColumn & /*src*/, size_t /* position */, size_t length) override { s += length; }
+
     void insertDefault() override
     {
         ++s;
diff --git a/src/Columns/ColumnFixedString.cpp b/src/Columns/ColumnFixedString.cpp
index e460c84d696..b55f68d4687 100644
--- a/src/Columns/ColumnFixedString.cpp
+++ b/src/Columns/ColumnFixedString.cpp
@@ -85,6 +85,20 @@ void ColumnFixedString::insertFrom(const IColumn & src_, size_t index)
     memcpySmallAllowReadWriteOverflow15(chars.data() + old_size, &src.chars[n * index], n);
 }
 
+void ColumnFixedString::insertManyFrom(const IColumn & src, size_t position, size_t length)
+{
+    const ColumnFixedString & src_concrete = assert_cast<const ColumnFixedString &>(src);
+    if (n != src_concrete.getN())
+        throw Exception(ErrorCodes::SIZE_OF_FIXED_STRING_DOESNT_MATCH, "Size of FixedString doesn't match");
+
+    const size_t old_size = chars.size();
+    const size_t new_size = old_size + n * length;
+    chars.resize(new_size);
+
+    for (size_t offset = old_size; offset < new_size; offset += n)
+        memcpySmallAllowReadWriteOverflow15(&chars[offset], &src_concrete.chars[n * position], n);
+}
+
 void ColumnFixedString::insertData(const char * pos, size_t length)
 {
     if (length > n)
diff --git a/src/Columns/ColumnFixedString.h b/src/Columns/ColumnFixedString.h
index f40e1356b27..56d42e8b34e 100644
--- a/src/Columns/ColumnFixedString.h
+++ b/src/Columns/ColumnFixedString.h
@@ -100,6 +100,8 @@ public:
 
     void insertFrom(const IColumn & src_, size_t index) override;
 
+    void insertManyFrom(const IColumn & src, size_t position, size_t length) override;
+
     void insertData(const char * pos, size_t length) override;
 
     void insertDefault() override

From aa6b70e5f2187be71b6bce835ecff0aa0c0bfca7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?=
Date: Tue, 5 Mar 2024 16:55:08 +0000
Subject: [PATCH 0092/1081] Add documentation to `simpleJSON` functions

---
 .../sql-reference/functions/json-functions.md | 392 +++++++++++++++---
 1 file changed, 342 insertions(+), 50 deletions(-)

diff --git a/docs/en/sql-reference/functions/json-functions.md 
b/docs/en/sql-reference/functions/json-functions.md index 2c837ff4a42..246cb8972fb 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -5,80 +5,372 @@ sidebar_label: JSON --- There are two sets of functions to parse JSON. - - `visitParam*` (`simpleJSON*`) is made to parse a special very limited subset of a JSON, but these functions are extremely fast. + - `simpleJSON*` (`visitParam*`) is made to parse a special very limited subset of a JSON, but these functions are extremely fast. - `JSONExtract*` is made to parse normal JSON. -# visitParam functions +# simpleJSON/visitParam functions ClickHouse has special functions for working with simplified JSON. All these JSON functions are based on strong assumptions about what the JSON can be, but they try to do as little as possible to get the job done. The following assumptions are made: 1. The field name (function argument) must be a constant. -2. The field name is somehow canonically encoded in JSON. For example: `visitParamHas('{"abc":"def"}', 'abc') = 1`, but `visitParamHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` +2. The field name is somehow canonically encoded in JSON. For example: `simpleJSONHas('{"abc":"def"}', 'abc') = 1`, but `simpleJSONHas('{"\\u0061\\u0062\\u0063":"def"}', 'abc') = 0` 3. Fields are searched for on any nesting level, indiscriminately. If there are multiple matching fields, the first occurrence is used. 4. The JSON does not have space characters outside of string literals. -## visitParamHas(params, name) +## simpleJSONHas -Checks whether there is a field with the `name` name. +Checks whether there is a field named `field_name`. The result is `UInt8`. -Alias: `simpleJSONHas`. +**Syntax** -## visitParamExtractUInt(params, name) - -Parses UInt64 from the value of the field named `name`. If this is a string field, it tries to parse a number from the beginning of the string. If the field does not exist, or it exists but does not contain a number, it returns 0. - -Alias: `simpleJSONExtractUInt`. - -## visitParamExtractInt(params, name) - -The same as for Int64. - -Alias: `simpleJSONExtractInt`. - -## visitParamExtractFloat(params, name) - -The same as for Float64. - -Alias: `simpleJSONExtractFloat`. - -## visitParamExtractBool(params, name) - -Parses a true/false value. The result is UInt8. - -Alias: `simpleJSONExtractBool`. - -## visitParamExtractRaw(params, name) - -Returns the value of a field, including separators. - -Alias: `simpleJSONExtractRaw`. - -Examples: - -``` sql -visitParamExtractRaw('{"abc":"\\n\\u0000"}', 'abc') = '"\\n\\u0000"'; -visitParamExtractRaw('{"abc":{"def":[1,2,3]}}', 'abc') = '{"def":[1,2,3]}'; +```sql +simpleJSONHas(json, field_name) ``` -## visitParamExtractString(params, name) +**Parameters** -Parses the string in double quotes. The value is unescaped. If unescaping failed, it returns an empty string. +- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `field_name`: The name of the field to search for. [String literal](../syntax#string) -Alias: `simpleJSONExtractString`. +**Returned value** -Examples: +It returns `1` if the field exists, `0` otherwise. 
-``` sql -visitParamExtractString('{"abc":"\\n\\u0000"}', 'abc') = '\n\0'; -visitParamExtractString('{"abc":"\\u263a"}', 'abc') = '☺'; -visitParamExtractString('{"abc":"\\u263"}', 'abc') = ''; -visitParamExtractString('{"abc":"hello}', 'abc') = ''; +**Example** + +Query: + +```sql +CREATE TABLE jsons +( + `json` String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"true","qux":1}'); + +SELECT simpleJSONHas(json, 'foo') FROM jsons; +SELECT simpleJSONHas(json, 'bar') FROM jsons; ``` +```response +1 +0 +``` +## simpleJSONExtractUInt + +Parses `UInt64` from the value of the field named `field_name`. If this is a string field, it tries to parse a number from the beginning of the string. If the field does not exist, or it exists but does not contain a number, it returns `0`. + +**Syntax** + +```sql +simpleJSONExtractUInt(json, field_name) +``` + +**Parameters** + +- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `field_name`: The name of the field to search for. [String literal](../syntax#string) + +**Returned value** + +It returns the number parsed from the field if the field exists and contains a number, `0` otherwise. + +**Example** + +Query: + +```sql +CREATE TABLE jsons +( + `json` String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"4e3"}'); +INSERT INTO jsons VALUES ('{"foo":3.4}'); +INSERT INTO jsons VALUES ('{"foo":5}'); +INSERT INTO jsons VALUES ('{"foo":"not1number"}'); +INSERT INTO jsons VALUES ('{"baz":2}'); + +SELECT simpleJSONExtractUInt(json, 'foo') FROM jsons ORDER BY json; +``` + +```response +0 +4 +0 +3 +5 +``` + +## simpleJSONExtractInt + +Parses `Int64` from the value of the field named `field_name`. If this is a string field, it tries to parse a number from the beginning of the string. If the field does not exist, or it exists but does not contain a number, it returns `0`. + +**Syntax** + +```sql +simpleJSONExtractInt(json, field_name) +``` + +**Parameters** + +- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `field_name`: The name of the field to search for. [String literal](../syntax#string) + +**Returned value** + +It returns the number parsed from the field if the field exists and contains a number, `0` otherwise. + +**Example** + +Query: + +```sql +CREATE TABLE jsons +( + `json` String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"-4e3"}'); +INSERT INTO jsons VALUES ('{"foo":-3.4}'); +INSERT INTO jsons VALUES ('{"foo":5}'); +INSERT INTO jsons VALUES ('{"foo":"not1number"}'); +INSERT INTO jsons VALUES ('{"baz":2}'); + +SELECT simpleJSONExtractInt(json, 'foo') FROM jsons ORDER BY json; +``` + +```response +0 +-4 +0 +-3 +5 +``` + +## simpleJSONExtractFloat + +Parses `Float64` from the value of the field named `field_name`. If this is a string field, it tries to parse a number from the beginning of the string. If the field does not exist, or it exists but does not contain a number, it returns `0`. + +**Syntax** + +```sql +simpleJSONExtractFloat(json, field_name) +``` + +**Parameters** + +- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string) +- `field_name`: The name of the field to search for. [String literal](../syntax#string) + +**Returned value** + +It returns the number parsed from the field if the field exists and contains a number, `0` otherwise. 
+
+**Example**
+
+Query:
+
+```sql
+CREATE TABLE jsons
+(
+    `json` String
+)
+ENGINE = Memory;
+
+INSERT INTO jsons VALUES ('{"foo":"-4e3"}');
+INSERT INTO jsons VALUES ('{"foo":-3.4}');
+INSERT INTO jsons VALUES ('{"foo":5}');
+INSERT INTO jsons VALUES ('{"foo":"not1number"}');
+INSERT INTO jsons VALUES ('{"baz":2}');
+
+SELECT simpleJSONExtractFloat(json, 'foo') FROM jsons ORDER BY json;
+```
+
+```response
+0
+-4000
+0
+-3.4
+5
+```
+
+## simpleJSONExtractBool
+
+Parses a true/false value from the value of the field named `field_name`. The result is `UInt8`.
+
+**Syntax**
+
+```sql
+simpleJSONExtractBool(json, field_name)
+```
+
+**Parameters**
+
+- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string)
+- `field_name`: The name of the field to search for. [String literal](../syntax#string)
+
+**Returned value**
+
+It returns `1` if the value of the field is `true`, `0` otherwise. This means the function returns `0` in (but not only in) the following cases:
+ - If the field doesn't exist.
+ - If the field contains `true` as a string, e.g.: `{"field":"true"}`.
+ - If the field contains `1` as a numerical value.
+
+**Example**
+
+Query:
+
+```sql
+CREATE TABLE jsons
+(
+    `json` String
+)
+ENGINE = Memory;
+
+INSERT INTO jsons VALUES ('{"foo":false,"bar":true}');
+INSERT INTO jsons VALUES ('{"foo":"true","qux":1}');
+
+SELECT simpleJSONExtractBool(json, 'bar') FROM jsons ORDER BY json;
+SELECT simpleJSONExtractBool(json, 'foo') FROM jsons ORDER BY json;
+```
+
+```response
+0
+1
+0
+0
+```
+
+## simpleJSONExtractRaw
+
+Returns the value of the field named `field_name` as a `String`, including separators.
+
+**Syntax**
+
+```sql
+simpleJSONExtractRaw(json, field_name)
+```
+
+**Parameters**
+
+- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string)
+- `field_name`: The name of the field to search for. [String literal](../syntax#string)
+
+**Returned value**
+
+It returns the value of the field as a [`String`](../../sql-reference/data-types/string.md#string), including separators, if the field exists, or an empty `String` otherwise.
+
+**Example**
+
+Query:
+
+```sql
+CREATE TABLE jsons
+(
+    `json` String
+)
+ENGINE = Memory;
+
+INSERT INTO jsons VALUES ('{"foo":"-4e3"}');
+INSERT INTO jsons VALUES ('{"foo":-3.4}');
+INSERT INTO jsons VALUES ('{"foo":5}');
+INSERT INTO jsons VALUES ('{"foo":{"def":[1,2,3]}}');
+INSERT INTO jsons VALUES ('{"baz":2}');
+
+SELECT simpleJSONExtractRaw(json, 'foo') FROM jsons ORDER BY json;
+```
+
+```response
+
+"-4e3"
+-3.4
+5
+{"def":[1,2,3]}
+```
+
+## simpleJSONExtractString
+
+Parses `String` in double quotes from the value of the field named `field_name`.
+
+**Syntax**
+
+```sql
+simpleJSONExtractString(json, field_name)
+```
+
+**Parameters**
+
+- `json`: The JSON in which the field is searched for. [String](../../sql-reference/data-types/string.md#string)
+- `field_name`: The name of the field to search for. [String literal](../syntax#string)
+
+**Returned value**
+
+It returns the value of a field as a [`String`](../../sql-reference/data-types/string.md#string), including separators. The value is unescaped. It returns an empty `String` if the field doesn't contain a double-quoted string, if unescaping fails, or if the field doesn't exist. 
+ +**Implementation details** + There is currently no support for code points in the format `\uXXXX\uYYYY` that are not from the basic multilingual plane (they are converted to CESU-8 instead of UTF-8). +**Example** + +Query: + +```sql +CREATE TABLE jsons +( + `json` String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"\\n\\u0000"}'); +INSERT INTO jsons VALUES ('{"foo":"\\u263"}'); +INSERT INTO jsons VALUES ('{"foo":"\\u263a"}'); +INSERT INTO jsons VALUES ('{"foo":"hello}'); + +SELECT simpleJSONExtractString(json, 'foo') FROM jsons ORDER BY json; +``` + +```response +\n\0 + +☺ + +``` + +## visitParamHas + +This function is [an alias of `simpleJSONHas`](./json-functions#simplejsonhas). + +## visitParamExtractUInt + +This function is [an alias of `simpleJSONExtractUInt`](./json-functions#simplejsonextractuint). + +## visitParamExtractInt + +This function is [an alias of `simpleJSONExtractInt`](./json-functions#simplejsonextractint). + +## visitParamExtractFloat + +This function is [an alias of `simpleJSONExtractFloat`](./json-functions#simplejsonextractfloat). + +## visitParamExtractBool + +This function is [an alias of `simpleJSONExtractBool`](./json-functions#simplejsonextractbool). + +## visitParamExtractRaw + +This function is [an alias of `simpleJSONExtractRaw`](./json-functions#simplejsonextractraw). + +## visitParamExtractString + +This function is [an alias of `simpleJSONExtractString`](./json-functions#simplejsonextractstring). + # JSONExtract functions The following functions are based on [simdjson](https://github.com/lemire/simdjson) designed for more complex JSON parsing requirements. From 981c507d8007a4f7761a83a2ecfa0956a364317d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Tue, 5 Mar 2024 17:01:54 +0000 Subject: [PATCH 0093/1081] Add example to `sin`. --- docs/en/sql-reference/functions/math-functions.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/docs/en/sql-reference/functions/math-functions.md b/docs/en/sql-reference/functions/math-functions.md index b27668caf0c..fc659891b5c 100644 --- a/docs/en/sql-reference/functions/math-functions.md +++ b/docs/en/sql-reference/functions/math-functions.md @@ -299,6 +299,18 @@ sin(x) Type: [Float*](../../sql-reference/data-types/float.md). +**Example** + +Query: + +```sql +SELECT sin(1.23); +``` + +```response +0.9424888019316975 +``` + ## cos Returns the cosine of the argument. 
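The patch above adds a usage example only for `sin`; the neighbouring `cos` section shown in the trailing context still has none. A matching query is sketched below for illustration. It is an editorial addition rather than part of the patch, and the commented result is the approximate mathematical value of cos(1.23), not captured server output.

```sql
SELECT cos(1.23); -- ≈ 0.3342377, consistent with sin(1.23) ≈ 0.9424888 and sin²x + cos²x = 1
```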
From d529389522311e7bca11a3beebc07e0439efcfb4 Mon Sep 17 00:00:00 2001 From: Zach Naimon Date: Tue, 5 Mar 2024 14:20:20 -0500 Subject: [PATCH 0094/1081] Add support for 'START TRANSACTION' syntax --- src/Parsers/ParserTransactionControl.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Parsers/ParserTransactionControl.cpp b/src/Parsers/ParserTransactionControl.cpp index da593170002..fc3077bb0b6 100644 --- a/src/Parsers/ParserTransactionControl.cpp +++ b/src/Parsers/ParserTransactionControl.cpp @@ -14,6 +14,8 @@ bool ParserTransactionControl::parseImpl(Pos & pos, ASTPtr & node, Expected & ex if (ParserKeyword("BEGIN TRANSACTION").ignore(pos, expected)) action = ASTTransactionControl::BEGIN; + else if (ParserKeyword("START TRANSACTION").ignore(pos, expected)) + action = ASTTransactionControl::BEGIN; else if (ParserKeyword("COMMIT").ignore(pos, expected)) action = ASTTransactionControl::COMMIT; else if (ParserKeyword("ROLLBACK").ignore(pos, expected)) From d596de73847c712d766e0f8598ce43ed078ec968 Mon Sep 17 00:00:00 2001 From: Zach Naimon Date: Tue, 5 Mar 2024 14:25:48 -0500 Subject: [PATCH 0095/1081] update documentation --- docs/en/sql-reference/transactions.md | 42 +++++++++++++++++++++------ 1 file changed, 33 insertions(+), 9 deletions(-) diff --git a/docs/en/sql-reference/transactions.md b/docs/en/sql-reference/transactions.md index cb89a091d68..b9c9afc20f9 100644 --- a/docs/en/sql-reference/transactions.md +++ b/docs/en/sql-reference/transactions.md @@ -1,26 +1,29 @@ --- slug: /en/guides/developer/transactional --- + # Transactional (ACID) support -## Case 1: INSERT into one partition, of one table, of the MergeTree* family +## Case 1: INSERT into one partition, of one table, of the MergeTree\* family This is transactional (ACID) if the inserted rows are packed and inserted as a single block (see Notes): + - Atomic: an INSERT succeeds or is rejected as a whole: if a confirmation is sent to the client, then all rows were inserted; if an error is sent to the client, then no rows were inserted. - Consistent: if there are no table constraints violated, then all rows in an INSERT are inserted and the INSERT succeeds; if constraints are violated, then no rows are inserted. - Isolated: concurrent clients observe a consistent snapshot of the table–the state of the table either as it was before the INSERT attempt, or after the successful INSERT; no partial state is seen - Durable: a successful INSERT is written to the filesystem before answering to the client, on a single replica or multiple replicas (controlled by the `insert_quorum` setting), and ClickHouse can ask the OS to sync the filesystem data on the storage media (controlled by the `fsync_after_insert` setting). - INSERT into multiple tables with one statement is possible if materialized views are involved (the INSERT from the client is to a table which has associate materialized views). 
-## Case 2: INSERT into multiple partitions, of one table, of the MergeTree* family +## Case 2: INSERT into multiple partitions, of one table, of the MergeTree\* family Same as Case 1 above, with this detail: + - If table has many partitions and INSERT covers many partitions–then insertion into every partition is transactional on its own - -## Case 3: INSERT into one distributed table of the MergeTree* family +## Case 3: INSERT into one distributed table of the MergeTree\* family Same as Case 1 above, with this detail: + - INSERT into Distributed table is not transactional as a whole, while insertion into every shard is transactional ## Case 4: Using a Buffer table @@ -30,9 +33,11 @@ Same as Case 1 above, with this detail: ## Case 5: Using async_insert Same as Case 1 above, with this detail: + - atomicity is ensured even if `async_insert` is enabled and `wait_for_async_insert` is set to 1 (the default), but if `wait_for_async_insert` is set to 0, then atomicity is not ensured. ## Notes + - rows inserted from the client in some data format are packed into a single block when: - the insert format is row-based (like CSV, TSV, Values, JSONEachRow, etc) and the data contains less then `max_insert_block_size` rows (~1 000 000 by default) or less then `min_chunk_bytes_for_parallel_parsing` bytes (10 MB by default) in case of parallel parsing is used (enabled by default) - the insert format is column-based (like Native, Parquet, ORC, etc) and the data contains only one block of data @@ -61,8 +66,9 @@ In addition to the functionality described at the top of this document, ClickHou ``` ### Notes + - This is an experimental feature, and changes should be expected. -- If an exception occurs during a transaction, you cannot commit the transaction. This includes all exceptions, including `UNKNOWN_FUNCTION` exceptions caused by typos. +- If an exception occurs during a transaction, you cannot commit the transaction. This includes all exceptions, including `UNKNOWN_FUNCTION` exceptions caused by typos. - Nested transactions are not supported; finish the current transaction and start a new one instead ### Configuration @@ -80,7 +86,7 @@ These examples are with a single node ClickHouse server with ClickHouse Keeper e #### Basic configuration for a single ClickHouse server node with ClickHouse Keeper enabled :::note -See the [deployment](docs/en/deployment-guides/terminology.md) documentation for details on deploying ClickHouse server and a proper quorum of ClickHouse Keeper nodes. The configuration shown here is for experimental purposes. +See the [deployment](docs/en/deployment-guides/terminology.md) documentation for details on deploying ClickHouse server and a proper quorum of ClickHouse Keeper nodes. The configuration shown here is for experimental purposes. ::: ```xml title=/etc/clickhouse-server/config.d/config.xml @@ -127,17 +133,19 @@ See the [deployment](docs/en/deployment-guides/terminology.md) documentation for #### Verify that experimental transactions are enabled -Issue a `BEGIN TRANSACTION` followed by a `ROLLBACK` to verify that experimental transactions are enabled, and that ClickHouse Keeper is enabled as it is used to track transactions. +Issue a `BEGIN TRANSACTION` or `START TRANSACTION` followed by a `ROLLBACK` to verify that experimental transactions are enabled, and that ClickHouse Keeper is enabled as it is used to track transactions. ```sql BEGIN TRANSACTION ``` + ```response Ok. 
``` :::tip If you see the following error, then check your configuration file to make sure that `allow_experimental_transactions` is set to `1` (or any value other than `0` or `false`). + ``` Code: 48. DB::Exception: Received from localhost:9000. DB::Exception: Transactions are not supported. @@ -145,15 +153,18 @@ DB::Exception: Transactions are not supported. ``` You can also check ClickHouse Keeper by issuing + ``` echo ruok | nc localhost 9181 ``` + ClickHouse Keeper should respond with `imok`. ::: ```sql ROLLBACK ``` + ```response Ok. ``` @@ -161,7 +172,7 @@ Ok. #### Create a table for testing :::tip -Creation of tables is not transactional. Run this DDL query outside of a transaction. +Creation of tables is not transactional. Run this DDL query outside of a transaction. ::: ```sql @@ -172,6 +183,7 @@ CREATE TABLE mergetree_table ENGINE = MergeTree ORDER BY n ``` + ```response Ok. ``` @@ -181,6 +193,7 @@ Ok. ```sql BEGIN TRANSACTION ``` + ```response Ok. ``` @@ -188,6 +201,7 @@ Ok. ```sql INSERT INTO mergetree_table FORMAT Values (10) ``` + ```response Ok. ``` @@ -196,11 +210,13 @@ Ok. SELECT * FROM mergetree_table ``` + ```response ┌──n─┐ │ 10 │ └────┘ ``` + :::note You can query the table from within a transaction and see that the row was inserted even though it has not yet been committed. ::: @@ -208,16 +224,20 @@ You can query the table from within a transaction and see that the row was inser #### Rollback the transaction, and query the table again Verify that the transaction is rolled back: + ```sql ROLLBACK ``` + ```response Ok. ``` + ```sql SELECT * FROM mergetree_table ``` + ```response Ok. @@ -229,6 +249,7 @@ Ok. ```sql BEGIN TRANSACTION ``` + ```response Ok. ``` @@ -236,6 +257,7 @@ Ok. ```sql INSERT INTO mergetree_table FORMAT Values (42) ``` + ```response Ok. ``` @@ -243,6 +265,7 @@ Ok. ```sql COMMIT ``` + ```response Ok. Elapsed: 0.002 sec. ``` @@ -251,6 +274,7 @@ Ok. Elapsed: 0.002 sec. SELECT * FROM mergetree_table ``` + ```response ┌──n─┐ │ 42 │ @@ -267,6 +291,7 @@ SELECT * FROM system.transactions FORMAT Vertical ``` + ```response Row 1: ────── @@ -280,4 +305,3 @@ state: RUNNING ## More Details See this [meta issue](https://github.com/ClickHouse/ClickHouse/issues/48794) to find much more extensive tests and to keep up to date with the progress. - From 34d327a08cc2de20510eb429c2da30b4d135a3f0 Mon Sep 17 00:00:00 2001 From: Zach Naimon Date: Tue, 5 Mar 2024 14:26:57 -0500 Subject: [PATCH 0096/1081] Revert "update documentation" This reverts commit d596de73847c712d766e0f8598ce43ed078ec968. --- docs/en/sql-reference/transactions.md | 42 ++++++--------------------- 1 file changed, 9 insertions(+), 33 deletions(-) diff --git a/docs/en/sql-reference/transactions.md b/docs/en/sql-reference/transactions.md index b9c9afc20f9..cb89a091d68 100644 --- a/docs/en/sql-reference/transactions.md +++ b/docs/en/sql-reference/transactions.md @@ -1,29 +1,26 @@ --- slug: /en/guides/developer/transactional --- - # Transactional (ACID) support -## Case 1: INSERT into one partition, of one table, of the MergeTree\* family +## Case 1: INSERT into one partition, of one table, of the MergeTree* family This is transactional (ACID) if the inserted rows are packed and inserted as a single block (see Notes): - - Atomic: an INSERT succeeds or is rejected as a whole: if a confirmation is sent to the client, then all rows were inserted; if an error is sent to the client, then no rows were inserted. 
- Consistent: if there are no table constraints violated, then all rows in an INSERT are inserted and the INSERT succeeds; if constraints are violated, then no rows are inserted. - Isolated: concurrent clients observe a consistent snapshot of the table–the state of the table either as it was before the INSERT attempt, or after the successful INSERT; no partial state is seen - Durable: a successful INSERT is written to the filesystem before answering to the client, on a single replica or multiple replicas (controlled by the `insert_quorum` setting), and ClickHouse can ask the OS to sync the filesystem data on the storage media (controlled by the `fsync_after_insert` setting). - INSERT into multiple tables with one statement is possible if materialized views are involved (the INSERT from the client is to a table which has associate materialized views). -## Case 2: INSERT into multiple partitions, of one table, of the MergeTree\* family +## Case 2: INSERT into multiple partitions, of one table, of the MergeTree* family Same as Case 1 above, with this detail: - - If table has many partitions and INSERT covers many partitions–then insertion into every partition is transactional on its own -## Case 3: INSERT into one distributed table of the MergeTree\* family + +## Case 3: INSERT into one distributed table of the MergeTree* family Same as Case 1 above, with this detail: - - INSERT into Distributed table is not transactional as a whole, while insertion into every shard is transactional ## Case 4: Using a Buffer table @@ -33,11 +30,9 @@ Same as Case 1 above, with this detail: ## Case 5: Using async_insert Same as Case 1 above, with this detail: - - atomicity is ensured even if `async_insert` is enabled and `wait_for_async_insert` is set to 1 (the default), but if `wait_for_async_insert` is set to 0, then atomicity is not ensured. ## Notes - - rows inserted from the client in some data format are packed into a single block when: - the insert format is row-based (like CSV, TSV, Values, JSONEachRow, etc) and the data contains less then `max_insert_block_size` rows (~1 000 000 by default) or less then `min_chunk_bytes_for_parallel_parsing` bytes (10 MB by default) in case of parallel parsing is used (enabled by default) - the insert format is column-based (like Native, Parquet, ORC, etc) and the data contains only one block of data @@ -66,9 +61,8 @@ In addition to the functionality described at the top of this document, ClickHou ``` ### Notes - - This is an experimental feature, and changes should be expected. -- If an exception occurs during a transaction, you cannot commit the transaction. This includes all exceptions, including `UNKNOWN_FUNCTION` exceptions caused by typos. +- If an exception occurs during a transaction, you cannot commit the transaction. This includes all exceptions, including `UNKNOWN_FUNCTION` exceptions caused by typos. - Nested transactions are not supported; finish the current transaction and start a new one instead ### Configuration @@ -86,7 +80,7 @@ These examples are with a single node ClickHouse server with ClickHouse Keeper e #### Basic configuration for a single ClickHouse server node with ClickHouse Keeper enabled :::note -See the [deployment](docs/en/deployment-guides/terminology.md) documentation for details on deploying ClickHouse server and a proper quorum of ClickHouse Keeper nodes. The configuration shown here is for experimental purposes. 
+See the [deployment](docs/en/deployment-guides/terminology.md) documentation for details on deploying ClickHouse server and a proper quorum of ClickHouse Keeper nodes. The configuration shown here is for experimental purposes. ::: ```xml title=/etc/clickhouse-server/config.d/config.xml @@ -133,19 +127,17 @@ See the [deployment](docs/en/deployment-guides/terminology.md) documentation for #### Verify that experimental transactions are enabled -Issue a `BEGIN TRANSACTION` or `START TRANSACTION` followed by a `ROLLBACK` to verify that experimental transactions are enabled, and that ClickHouse Keeper is enabled as it is used to track transactions. +Issue a `BEGIN TRANSACTION` followed by a `ROLLBACK` to verify that experimental transactions are enabled, and that ClickHouse Keeper is enabled as it is used to track transactions. ```sql BEGIN TRANSACTION ``` - ```response Ok. ``` :::tip If you see the following error, then check your configuration file to make sure that `allow_experimental_transactions` is set to `1` (or any value other than `0` or `false`). - ``` Code: 48. DB::Exception: Received from localhost:9000. DB::Exception: Transactions are not supported. @@ -153,18 +145,15 @@ DB::Exception: Transactions are not supported. ``` You can also check ClickHouse Keeper by issuing - ``` echo ruok | nc localhost 9181 ``` - ClickHouse Keeper should respond with `imok`. ::: ```sql ROLLBACK ``` - ```response Ok. ``` @@ -172,7 +161,7 @@ Ok. #### Create a table for testing :::tip -Creation of tables is not transactional. Run this DDL query outside of a transaction. +Creation of tables is not transactional. Run this DDL query outside of a transaction. ::: ```sql @@ -183,7 +172,6 @@ CREATE TABLE mergetree_table ENGINE = MergeTree ORDER BY n ``` - ```response Ok. ``` @@ -193,7 +181,6 @@ Ok. ```sql BEGIN TRANSACTION ``` - ```response Ok. ``` @@ -201,7 +188,6 @@ Ok. ```sql INSERT INTO mergetree_table FORMAT Values (10) ``` - ```response Ok. ``` @@ -210,13 +196,11 @@ Ok. SELECT * FROM mergetree_table ``` - ```response ┌──n─┐ │ 10 │ └────┘ ``` - :::note You can query the table from within a transaction and see that the row was inserted even though it has not yet been committed. ::: @@ -224,20 +208,16 @@ You can query the table from within a transaction and see that the row was inser #### Rollback the transaction, and query the table again Verify that the transaction is rolled back: - ```sql ROLLBACK ``` - ```response Ok. ``` - ```sql SELECT * FROM mergetree_table ``` - ```response Ok. @@ -249,7 +229,6 @@ Ok. ```sql BEGIN TRANSACTION ``` - ```response Ok. ``` @@ -257,7 +236,6 @@ Ok. ```sql INSERT INTO mergetree_table FORMAT Values (42) ``` - ```response Ok. ``` @@ -265,7 +243,6 @@ Ok. ```sql COMMIT ``` - ```response Ok. Elapsed: 0.002 sec. ``` @@ -274,7 +251,6 @@ Ok. Elapsed: 0.002 sec. SELECT * FROM mergetree_table ``` - ```response ┌──n─┐ │ 42 │ @@ -291,7 +267,6 @@ SELECT * FROM system.transactions FORMAT Vertical ``` - ```response Row 1: ────── @@ -305,3 +280,4 @@ state: RUNNING ## More Details See this [meta issue](https://github.com/ClickHouse/ClickHouse/issues/48794) to find much more extensive tests and to keep up to date with the progress. 
+ From f5cf61c32e9f511933b56048f22aaf43fad67a6c Mon Sep 17 00:00:00 2001 From: Zach Naimon Date: Tue, 5 Mar 2024 14:28:48 -0500 Subject: [PATCH 0097/1081] update documentation without autoformat --- docs/en/sql-reference/transactions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/transactions.md b/docs/en/sql-reference/transactions.md index cb89a091d68..09cdc192b03 100644 --- a/docs/en/sql-reference/transactions.md +++ b/docs/en/sql-reference/transactions.md @@ -127,7 +127,7 @@ See the [deployment](docs/en/deployment-guides/terminology.md) documentation for #### Verify that experimental transactions are enabled -Issue a `BEGIN TRANSACTION` followed by a `ROLLBACK` to verify that experimental transactions are enabled, and that ClickHouse Keeper is enabled as it is used to track transactions. +Issue a `BEGIN TRANSACTION` or `START TRANSACTION` followed by a `ROLLBACK` to verify that experimental transactions are enabled, and that ClickHouse Keeper is enabled as it is used to track transactions. ```sql BEGIN TRANSACTION From b074477ffb841587c19313063c249bc4c35ef301 Mon Sep 17 00:00:00 2001 From: Zach Naimon Date: Tue, 5 Mar 2024 14:45:09 -0500 Subject: [PATCH 0098/1081] use new syntax in a test --- tests/integration/test_transactions/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_transactions/test.py b/tests/integration/test_transactions/test.py index 46660581223..584e59ba71b 100644 --- a/tests/integration/test_transactions/test.py +++ b/tests/integration/test_transactions/test.py @@ -67,8 +67,8 @@ def test_rollback_unfinished_on_restart1(start_cluster): tx(1, "insert into mt values (5, 50)") tx(1, "alter table mt update m = m+n in partition id '1' where 1") - # check that uncommitted insert will be rolled back on restart - tx(3, "begin transaction") + # check that uncommitted insert will be rolled back on restart (using `START TRANSACTION syntax`) + tx(3, "start transaction") tid5 = tx(3, "select transactionID()").strip() tx(3, "insert into mt values (6, 70)") From 7ed1be6f3c51db3bd4abb57c429c77e09b0cd8a8 Mon Sep 17 00:00:00 2001 From: Zach Naimon Date: Tue, 5 Mar 2024 14:46:39 -0500 Subject: [PATCH 0099/1081] fix test comment --- tests/integration/test_transactions/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_transactions/test.py b/tests/integration/test_transactions/test.py index 584e59ba71b..d63b7b6f545 100644 --- a/tests/integration/test_transactions/test.py +++ b/tests/integration/test_transactions/test.py @@ -67,7 +67,7 @@ def test_rollback_unfinished_on_restart1(start_cluster): tx(1, "insert into mt values (5, 50)") tx(1, "alter table mt update m = m+n in partition id '1' where 1") - # check that uncommitted insert will be rolled back on restart (using `START TRANSACTION syntax`) + # check that uncommitted insert will be rolled back on restart (using `START TRANSACTION` syntax) tx(3, "start transaction") tid5 = tx(3, "select transactionID()").strip() tx(3, "insert into mt values (6, 70)") From 53c9d4513c4b93ed79df305bb5c36c0cfb43ef79 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 6 Mar 2024 12:16:17 +0800 Subject: [PATCH 0100/1081] finish dev column array --- src/Columns/ColumnArray.cpp | 132 +++++++++++++++++++++++++++++++++--- src/Columns/ColumnArray.h | 3 + 2 files changed, 125 insertions(+), 10 deletions(-) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index b620da81ae8..aa0d5aa3e50 
100644
--- a/src/Columns/ColumnArray.cpp
+++ b/src/Columns/ColumnArray.cpp
@@ -31,6 +31,7 @@ namespace ErrorCodes
     extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
     extern const int LOGICAL_ERROR;
     extern const int TOO_LARGE_ARRAY_SIZE;
+    extern const int ILLEGAL_COLUMN;
 }
 
 /** Obtaining array as Field can be slow for large arrays and consume vast amount of memory.
@@ -363,6 +364,19 @@ void ColumnArray::insertManyFromNumber(const ColumnArray & src, size_t position,
         memcpy(&data_ref[offset], &src_data[src_offset], src_size * sizeof(T));
 }
 
+void ColumnArray::insertManyFromConst(const ColumnConst & src, size_t position, size_t length)
+{
+    const ColumnArray * src_array = typeid_cast<const ColumnArray *>(&src.getDataColumn());
+    if (!src_array)
+        throw Exception(
+            ErrorCodes::ILLEGAL_COLUMN,
+            "Cannot insert from const column of type {} to column of type {}",
+            src.getDataColumn().getName(),
+            getName());
+
+    insertManyFromImpl(*src_array, 0, length, true);
+}
+
 void ColumnArray::insertManyFromString(const ColumnArray & src, size_t position, size_t length)
 {
     size_t src_size = src.sizeAt(position);
@@ -400,12 +414,53 @@ void ColumnArray::insertManyFromString(const ColumnArray & src, size_t position,
 
 void ColumnArray::insertManyFromTuple(const ColumnArray & src, size_t position, size_t length)
 {
+    ColumnTuple & tuple = assert_cast<ColumnTuple &>(getData());
+    const ColumnTuple & src_tuple = assert_cast<const ColumnTuple &>(src.getData());
 
+    /// Make temporary arrays for each component of Tuple. In the same way as for Nullable.
+    size_t tuple_size = tuple.tupleSize();
+    size_t src_tuple_size = src_tuple.tupleSize();
+    if (tuple_size == 0)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Empty tuple");
+    if (tuple_size != src_tuple_size)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Nested tuple size mismatch: {} vs {}", tuple_size, src_tuple_size);
+
+    Columns temporary_arrays(tuple_size);
+    Columns src_temporary_arrays(tuple_size);
+    for (size_t i = 0; i < tuple_size; ++i)
+    {
+        temporary_arrays[i] = ColumnArray::create(tuple.getColumn(i).assumeMutable(), getOffsetsPtr()->assumeMutable());
+        src_temporary_arrays[i] = ColumnArray::create(src_tuple.getColumn(i).assumeMutable(), src.getOffsetsPtr()->assumeMutable());
+        assert_cast<ColumnArray &>(*temporary_arrays[i])
+            .insertManyFromImpl(assert_cast<const ColumnArray &>(*src_temporary_arrays[i]), position, length, false);
+    }
+
+    Columns tuple_columns(tuple_size);
+    for (size_t i = 0; i < tuple_size; ++i)
+        tuple_columns[i] = assert_cast<const ColumnArray &>(*temporary_arrays[i]).getDataPtr();
+
+    getDataPtr() = ColumnTuple::create(std::move(tuple_columns));
 }
+
 void ColumnArray::insertManyFromNullable(const ColumnArray & src, size_t position, size_t length)
 {
+    ColumnNullable & nullable = assert_cast<ColumnNullable &>(getData());
+    const ColumnNullable & src_nullable = assert_cast<const ColumnNullable &>(src.getData());
 
+    /// Process nested column without updating array offsets
+    auto array_of_nested = ColumnArray(nullable.getNestedColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable());
+    auto src_array_of_nested = ColumnArray(src_nullable.getNestedColumnPtr()->assumeMutable(), src.getOffsetsPtr()->assumeMutable());
+    array_of_nested.insertManyFromImpl(src_array_of_nested, position, length, false);
+
+    /// Process null map column without updating array offsets
+    auto array_of_null_map = ColumnArray(nullable.getNullMapColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable());
+    auto src_array_of_null_map = ColumnArray(src_nullable.getNullMapColumnPtr()->assumeMutable(), src.getOffsetsPtr()->assumeMutable());
+    array_of_null_map.insertManyFromImpl(src_array_of_null_map, position, length, 
+ void ColumnArray::insertManyFromGeneric(const ColumnArray & src, size_t position, size_t length) { size_t src_size = src.sizeAt(position); @@ -419,16 +474,73 @@ void ColumnArray::insertManyFromGeneric(const ColumnArray & src, size_t position void ColumnArray::insertManyFrom(const IColumn & src_, size_t position, size_t length) { - /// First fill offsets - const ColumnArray & src = assert_cast<const ColumnArray &>(src_); - size_t src_size = src.sizeAt(position); - auto & offsets_ref = getOffsets(); - size_t old_rows = offsets_ref.size(); - size_t new_rows = old_rows + length; - size_t old_size = offsets_ref.back(); - offsets_ref.resize(new_rows); - for (size_t i = 0, offset = old_size + src_size; i < length; ++i, offset += src_size) - offsets_ref[old_rows + i] = offset; + const ColumnConst * src_const = typeid_cast<const ColumnConst *>(&src_); + if (src_const) + return insertManyFromConst(*src_const, position, length); + + const ColumnArray * src_array = typeid_cast<const ColumnArray *>(&src_); + if (!src_array) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot insert from column of type {} to column of type {}", src_.getName(), getName()); + + return insertManyFromImpl(*src_array, position, length, true); +} + +void ColumnArray::insertManyFromImpl(const ColumnArray & src, size_t position, size_t length, bool update_offsets) +{ + /// First fill offsets if needed + if (update_offsets) + { + size_t src_size = src.sizeAt(position); + auto & offsets_ref = getOffsets(); + size_t old_rows = offsets_ref.size(); + size_t new_rows = old_rows + length; + size_t old_size = offsets_ref.back(); + offsets_ref.resize(new_rows); + for (size_t i = 0, offset = old_size + src_size; i < length; ++i, offset += src_size) + offsets_ref[old_rows + i] = offset; + } + + if (typeid_cast<ColumnUInt8 *>(data.get())) + return insertManyFromNumber<UInt8>(src, position, length); + if (typeid_cast<ColumnUInt16 *>(data.get())) + return insertManyFromNumber<UInt16>(src, position, length); + if (typeid_cast<ColumnUInt32 *>(data.get())) + return insertManyFromNumber<UInt32>(src, position, length); + if (typeid_cast<ColumnUInt64 *>(data.get())) + return insertManyFromNumber<UInt64>(src, position, length); + if (typeid_cast<ColumnUInt128 *>(data.get())) + return insertManyFromNumber<UInt128>(src, position, length); + if (typeid_cast<ColumnUInt256 *>(data.get())) + return insertManyFromNumber<UInt256>(src, position, length); + if (typeid_cast<ColumnInt8 *>(data.get())) + return insertManyFromNumber<Int8>(src, position, length); + if (typeid_cast<ColumnInt16 *>(data.get())) + return insertManyFromNumber<Int16>(src, position, length); + if (typeid_cast<ColumnInt32 *>(data.get())) + return insertManyFromNumber<Int32>(src, position, length); + if (typeid_cast<ColumnInt64 *>(data.get())) + return insertManyFromNumber<Int64>(src, position, length); + if (typeid_cast<ColumnInt128 *>(data.get())) + return insertManyFromNumber<Int128>(src, position, length); + if (typeid_cast<ColumnInt256 *>(data.get())) + return insertManyFromNumber<Int256>(src, position, length); + if (typeid_cast<ColumnFloat32 *>(data.get())) + return insertManyFromNumber<Float32>(src, position, length); + if (typeid_cast<ColumnFloat64 *>(data.get())) + return insertManyFromNumber<Float64>(src, position, length); + if (typeid_cast<ColumnDecimal<Decimal32> *>(data.get())) + return insertManyFromNumber<Decimal32>(src, position, length); + if (typeid_cast<ColumnDecimal<Decimal64> *>(data.get())) + return insertManyFromNumber<Decimal64>(src, position, length); + if (typeid_cast<ColumnDecimal<Decimal128> *>(data.get())) + return insertManyFromNumber<Decimal128>(src, position, length); + if (typeid_cast<ColumnDecimal<Decimal256> *>(data.get())) + return insertManyFromNumber<Decimal256>(src, position, length); + if (typeid_cast<ColumnDecimal<DateTime64> *>(data.get())) + return insertManyFromNumber<DateTime64>(src, position, length); + if (typeid_cast<ColumnNullable *>(data.get())) + return insertManyFromNullable(src, position, length); + if (typeid_cast<ColumnTuple *>(data.get())) + return insertManyFromTuple(src, position, length); + return insertManyFromGeneric(src, position, length); }
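The insertManyFromImpl dispatch just above is a hand-rolled switch from the column's dynamic type to a fully typed fast path; any nested type that is not matched falls through to the generic per-row loop. The shape of that pattern, reduced to a self-contained sketch (class and function names invented for illustration, not the real IColumn hierarchy):

```cpp
#include <cstdint>
#include <cstdio>

struct IColumn { virtual ~IColumn() = default; };

template <typename T>
struct ColumnVector : IColumn { /* contiguous storage of T */ };

template <typename T>
void insertManyFromNumber(IColumn &)
{
    std::puts("typed fast path: bulk memcpy of fixed-size elements");
}

void insertManyFromGeneric(IColumn &)
{
    std::puts("generic fallback: element-by-element virtual calls");
}

void insertManyFromImpl(IColumn & data)
{
    /// Probe the dynamic type once up front, then run a fully typed loop.
    if (dynamic_cast<ColumnVector<uint8_t> *>(&data))
        return insertManyFromNumber<uint8_t>(data);
    if (dynamic_cast<ColumnVector<int64_t> *>(&data))
        return insertManyFromNumber<int64_t>(data);
    insertManyFromGeneric(data);  /// everything else, e.g. deeply nested types
}

int main()
{
    ColumnVector<int64_t> col;
    insertManyFromImpl(col);  /// selects the typed fast path
}
```

The point of paying for the long if-chain once per call is that the copy itself then runs without any per-row virtual dispatch.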
void ColumnArray::insertDefault() diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h index 73d632a38b9..765f86ec552 100644 --- a/src/Columns/ColumnArray.h +++ b/src/Columns/ColumnArray.h @@ -215,6 +215,9 @@ private: ColumnPtr filterGeneric(const Filter & filt, ssize_t result_size_hint) const; /// Specializations for insertManyFrom + void insertManyFromConst(const ColumnConst & src, size_t position, size_t length); + void insertManyFromImpl(const ColumnArray & src, size_t position, size_t length, bool update_offsets = true); + template <typename T> void insertManyFromNumber(const ColumnArray & src, size_t position, size_t length); void insertManyFromString(const ColumnArray & src, size_t position, size_t length);
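The follow-up patch below extends the same optimization to Map and Tuple: a Map column physically stores an Array of (key, value) Tuples, so its insertManyFrom can delegate wholesale to the nested column, while a Tuple column fans the call out to each component column. A simplified struct-of-arrays sketch of that delegation, under the assumption of plain vectors rather than the real column classes:

```cpp
#include <cstddef>
#include <stdexcept>
#include <vector>

/// Tuple stored column-wise: one vector per component.
struct SimpleTupleColumn
{
    std::vector<std::vector<double>> columns;

    void insertManyFrom(const SimpleTupleColumn & src, size_t position, size_t length)
    {
        if (src.columns.size() != columns.size())
            throw std::runtime_error("cannot insert value of different size into tuple");
        /// Fan out: repeat row `position` of every component `length` times.
        for (size_t i = 0; i < columns.size(); ++i)
            columns[i].insert(columns[i].end(), length, src.columns[i][position]);
    }
};

/// A map is an array of (key, value) tuples, so it just forwards to its nested column.
struct SimpleMapColumn
{
    SimpleTupleColumn nested;

    void insertManyFrom(const SimpleMapColumn & src, size_t position, size_t length)
    {
        nested.insertManyFrom(src.nested, position, length);
    }
};

int main()
{
    SimpleTupleColumn src{{{1.0, 2.0}, {10.0, 20.0}}};  /// two components, two rows
    SimpleTupleColumn dst{{{}, {}}};
    dst.insertManyFrom(src, 1, 3);  /// repeats (2.0, 20.0) three times
    return dst.columns[0].size() == 3 ? 0 : 1;
}
```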
From 3bf3c7cc708d1a564896d649a1a804b868f89d8d Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 6 Mar 2024 12:32:23 +0800 Subject: [PATCH 0101/1081] finish column map and tuple --- src/Columns/ColumnArray.cpp | 2 +- src/Columns/ColumnMap.cpp | 5 +++++ src/Columns/ColumnMap.h | 1 + src/Columns/ColumnTuple.cpp | 12 ++++++++++++ src/Columns/ColumnTuple.h | 1 + 5 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index aa0d5aa3e50..5b0df8e9b6b 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -364,7 +364,7 @@ void ColumnArray::insertManyFromNumber(const ColumnArray & src, size_t position, memcpy(&data_ref[offset], &src_data[src_offset], src_size * sizeof(T)); } -void ColumnArray::insertManyFromConst(const ColumnConst & src, size_t position, size_t length) +void ColumnArray::insertManyFromConst(const ColumnConst & src, size_t /*position*/, size_t length) { const ColumnArray * src_array = typeid_cast<const ColumnArray *>(&src.getDataColumn()); diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp index 995f3103484..57e8ba685b4 100644 --- a/src/Columns/ColumnMap.cpp +++ b/src/Columns/ColumnMap.cpp @@ -158,6 +158,11 @@ void ColumnMap::insertFrom(const IColumn & src, size_t n) nested->insertFrom(assert_cast<const ColumnMap &>(src).getNestedColumn(), n); } +void ColumnMap::insertManyFrom(const IColumn & src, size_t position, size_t length) +{ + assert_cast<ColumnArray &>(*nested).insertManyFrom(assert_cast<const ColumnMap &>(src).getNestedColumn(), position, length); +} + void ColumnMap::insertRangeFrom(const IColumn & src, size_t start, size_t length) { nested->insertRangeFrom( diff --git a/src/Columns/ColumnMap.h b/src/Columns/ColumnMap.h index 17cd86a3788..60aa69e7bf6 100644 --- a/src/Columns/ColumnMap.h +++ b/src/Columns/ColumnMap.h @@ -67,6 +67,7 @@ public: void updateWeakHash32(WeakHash32 & hash) const override; void updateHashFast(SipHash & hash) const override; void insertFrom(const IColumn & src_, size_t n) override; + void insertManyFrom(const IColumn & src, size_t position, size_t length) override; void insertRangeFrom(const IColumn & src, size_t start, size_t length) override; ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override; void expand(const Filter & mask, bool inverted) override; diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp index 17cc58d92f5..062bdadf9d2 100644 --- a/src/Columns/ColumnTuple.cpp +++ b/src/Columns/ColumnTuple.cpp @@ -185,6 +185,18 @@ void ColumnTuple::insertFrom(const IColumn & src_, size_t n) columns[i]->insertFrom(*src.columns[i], n); } +void ColumnTuple::insertManyFrom(const IColumn & src, size_t position, size_t length) +{ + const ColumnTuple & src_tuple = assert_cast<const ColumnTuple &>(src); + + const size_t tuple_size = columns.size(); + if (src_tuple.columns.size() != tuple_size) + throw Exception(ErrorCodes::CANNOT_INSERT_VALUE_OF_DIFFERENT_SIZE_INTO_TUPLE, "Cannot insert value of different size into tuple"); + + for (size_t i = 0; i < tuple_size; ++i) + columns[i]->insertManyFrom(*src_tuple.columns[i], position, length); +} + void ColumnTuple::insertDefault() { for (auto & column : columns) diff --git a/src/Columns/ColumnTuple.h b/src/Columns/ColumnTuple.h index 610416b8b11..5b626155754 100644 --- a/src/Columns/ColumnTuple.h +++ b/src/Columns/ColumnTuple.h @@ -60,6 +60,7 @@ public: void insert(const Field & x) override; bool tryInsert(const Field & x) override; void insertFrom(const IColumn & src_, size_t n) override; + void insertManyFrom(const IColumn & src, size_t position, size_t length) override; void insertDefault() override; void popBack(size_t n) override; StringRef serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const override; From 3005bff23100539dbb71f9623dc3aed9c34a87f6 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 6 Mar 2024 14:43:33 +0800 Subject: [PATCH 0102/1081] fix building --- src/Columns/ColumnArray.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 5b0df8e9b6b..389b3e97820 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -425,7 +425,7 @@ void ColumnArray::insertManyFromTuple(const ColumnArray & src, size_t position, if (tuple_size != src_tuple_size) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Nested tuple size mismatch: {} vs {}", tuple_size, src_tuple_size); - Columns temporary_arrays(tuple_size); + MutableColumns temporary_arrays(tuple_size); Columns src_temporary_arrays(tuple_size); for (size_t i = 0; i < tuple_size; ++i) { From 3dbb0a12fb433b29107d449099efbc99f5d71f34 Mon Sep 17 00:00:00 2001 From: unashi Date: Wed, 6 Mar 2024 16:15:37 +0800 Subject: [PATCH 0103/1081] [fix] style --- src/Storages/MergeTree/MergeTreeData.cpp | 7 +++---- tests/integration/helpers/cluster.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index d8680958c21..c76ffeee874 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7161,11 +7161,10 @@ std::pair MergeTreeData::cloneAn try { auto reservation_space = src_part_storage->reserve(src_part->getBytesOnDisk()); - if (!reservation_space) { + if (!reservation_space) throw Exception(ErrorCodes::NOT_ENOUGH_SPACE, "Not enough space on disk."); - } - dst_part_storage - = src_part_storage->clonePart(this->getRelativeDataPath(), tmp_dst_part_name, disk, read_settings, write_settings, {}, {}); + dst_part_storage = src_part_storage->clonePart( + this->getRelativeDataPath(), tmp_dst_part_name, disk, read_settings, write_settings, {}, {}); copy_successful = true; break; } diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 1d96563251b..767ba5b6660 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -3216,7 +3216,7 @@ services: - timeout:1 - inet6 - rotate - {networks} + {123} {app_net} {ipv4_address} {ipv6_address} From b4dba828a4dcde93944e05b512818827fd3e5a85 Mon Sep 17 00:00:00 2001 From: unashi Date: Wed, 6 Mar 2024 16:19:07 +0800 Subject: [PATCH 0104/1081] [fix] --- tests/integration/helpers/cluster.py | 2 +- 1
file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/helpers/cluster.py b/tests/integration/helpers/cluster.py index 767ba5b6660..1d96563251b 100644 --- a/tests/integration/helpers/cluster.py +++ b/tests/integration/helpers/cluster.py @@ -3216,7 +3216,7 @@ services: - timeout:1 - inet6 - rotate - {123} + {networks} {app_net} {ipv4_address} {ipv6_address} From 6d4514c045cc565919f9c8384710eee89354f0f3 Mon Sep 17 00:00:00 2001 From: kssenii Date: Wed, 6 Mar 2024 16:55:48 +0800 Subject: [PATCH 0105/1081] Fix test --- src/Storages/System/StorageSystemDisks.cpp | 10 +++++++++- tests/integration/test_backup_restore_s3/test.py | 12 ++++++------ 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/Storages/System/StorageSystemDisks.cpp b/src/Storages/System/StorageSystemDisks.cpp index 30d64156b22..0f8a6640f2c 100644 --- a/src/Storages/System/StorageSystemDisks.cpp +++ b/src/Storages/System/StorageSystemDisks.cpp @@ -25,6 +25,8 @@ StorageSystemDisks::StorageSystemDisks(const StorageID & table_id_) {"unreserved_space", std::make_shared()}, {"keep_free_space", std::make_shared()}, {"type", std::make_shared()}, + {"object_storage_type", std::make_shared()}, + {"metadata_type", std::make_shared()}, {"is_encrypted", std::make_shared()}, {"is_read_only", std::make_shared()}, {"is_write_once", std::make_shared()}, @@ -53,6 +55,8 @@ Pipe StorageSystemDisks::read( MutableColumnPtr col_unreserved = ColumnUInt64::create(); MutableColumnPtr col_keep = ColumnUInt64::create(); MutableColumnPtr col_type = ColumnString::create(); + MutableColumnPtr col_object_storage_type = ColumnString::create(); + MutableColumnPtr col_metadata_type = ColumnString::create(); MutableColumnPtr col_is_encrypted = ColumnUInt8::create(); MutableColumnPtr col_is_read_only = ColumnUInt8::create(); MutableColumnPtr col_is_write_once = ColumnUInt8::create(); @@ -69,7 +73,9 @@ Pipe StorageSystemDisks::read( col_unreserved->insert(disk_ptr->getUnreservedSpace().value_or(std::numeric_limits::max())); col_keep->insert(disk_ptr->getKeepingFreeSpace()); auto data_source_description = disk_ptr->getDataSourceDescription(); - col_type->insert(data_source_description.toString()); + col_type->insert(data_source_description.type); + col_object_storage_type->insert(data_source_description.object_storage_type); + col_metadata_type->insert(data_source_description.metadata_type); col_is_encrypted->insert(data_source_description.is_encrypted); col_is_read_only->insert(disk_ptr->isReadOnly()); col_is_write_once->insert(disk_ptr->isWriteOnce()); @@ -91,6 +97,8 @@ Pipe StorageSystemDisks::read( res_columns.emplace_back(std::move(col_unreserved)); res_columns.emplace_back(std::move(col_keep)); res_columns.emplace_back(std::move(col_type)); + res_columns.emplace_back(std::move(col_object_storage_type)); + res_columns.emplace_back(std::move(col_metadata_type)); res_columns.emplace_back(std::move(col_is_encrypted)); res_columns.emplace_back(std::move(col_is_read_only)); res_columns.emplace_back(std::move(col_is_write_once)); diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index 4d3ee8200a3..95e264107e4 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -124,15 +124,15 @@ def check_backup_and_restore( def check_system_tables(backup_query_id=None): disks = [ tuple(disk.split("\t")) - for disk in node.query("SELECT name, type FROM system.disks").split("\n") + for disk in node.query("SELECT name, 
type, object_storage_type, metadata_type FROM system.disks").split("\n") if disk ] expected_disks = ( - ("default", "local"), - ("disk_s3", "s3"), - ("disk_s3_cache", "s3"), - ("disk_s3_other_bucket", "s3"), - ("disk_s3_plain", "s3_plain"), + ("default", "local", "", ""), + ("disk_s3", "object_storage", "s3", "local"), + ("disk_s3_cache", "object_storage", "s3", "local"), + ("disk_s3_other_bucket", "object_storage", "s3", "local"), + ("disk_s3_plain", "object_storage", "s3", "plain"), ) assert len(expected_disks) == len(disks) for expected_disk in expected_disks: From be98c95f586762cdf20a6375917e30f296175593 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Wed, 6 Mar 2024 09:12:26 +0000 Subject: [PATCH 0106/1081] Automatic style fix --- tests/integration/test_backup_restore_s3/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index 95e264107e4..452a9143067 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -124,7 +124,9 @@ def check_backup_and_restore( def check_system_tables(backup_query_id=None): disks = [ tuple(disk.split("\t")) - for disk in node.query("SELECT name, type, object_storage_type, metadata_type FROM system.disks").split("\n") + for disk in node.query( + "SELECT name, type, object_storage_type, metadata_type FROM system.disks" + ).split("\n") if disk ] expected_disks = ( From 8e413da8f156ab03c875b9525044265cffcc5b83 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 6 Mar 2024 17:32:08 +0800 Subject: [PATCH 0107/1081] apply opts for string nested in array --- src/Columns/ColumnArray.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 389b3e97820..44b17c89ae1 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -536,6 +536,8 @@ void ColumnArray::insertManyFromImpl(const ColumnArray & src, size_t position, s return insertManyFromNumber(src, position, length); if (typeid_cast *>(data.get())) return insertManyFromNumber(src, position, length); + if (typeid_cast(data.get())) + return insertManyFromString(src, position, length); if (typeid_cast(data.get())) return insertManyFromNullable(src, position, length); if (typeid_cast(data.get())) From 56fb61e1866e81e9a00b9b98299ddc56a54f5394 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 6 Mar 2024 10:53:39 +0000 Subject: [PATCH 0108/1081] Do not duplicate the first category in case of multiple categories in `FunctionDocumentation` --- src/Common/FunctionDocumentation.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Common/FunctionDocumentation.cpp b/src/Common/FunctionDocumentation.cpp index 2aad23b90b7..0dc5b48f9d1 100644 --- a/src/Common/FunctionDocumentation.cpp +++ b/src/Common/FunctionDocumentation.cpp @@ -36,6 +36,7 @@ std::string FunctionDocumentation::categoriesAsString() const auto it = categories.begin(); std::string res = *it; + ++it; for (; it != categories.end(); ++it) res += ", " + *it; return res; From 6f726865baf3fea606e7ff46e5d8cd98bda94f5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 6 Mar 2024 11:10:02 +0000 Subject: [PATCH 0109/1081] Add inline docs to functions --- src/Functions/sin.cpp | 10 +++++- src/Functions/visitParamExtractBool.cpp | 30 +++++++++++++++++- src/Functions/visitParamExtractFloat.cpp | 31 
++++++++++++++++++- src/Functions/visitParamExtractInt.cpp | 31 ++++++++++++++++++- src/Functions/visitParamExtractRaw.cpp | 30 +++++++++++++++++- src/Functions/visitParamExtractString.cpp | 30 +++++++++++++++++- src/Functions/visitParamExtractUInt.cpp | 31 ++++++++++++++++++- src/Functions/visitParamHas.cpp | 23 +++++++++++++- ...new_functions_must_be_documented.reference | 8 ----- 9 files changed, 208 insertions(+), 16 deletions(-) diff --git a/src/Functions/sin.cpp b/src/Functions/sin.cpp index dc75f4800c0..914f431adb4 100644 --- a/src/Functions/sin.cpp +++ b/src/Functions/sin.cpp @@ -13,7 +13,15 @@ using FunctionSin = FunctionMathUnary>; REGISTER_FUNCTION(Sin) { - factory.registerFunction({}, FunctionFactory::CaseInsensitive); + factory.registerFunction( + FunctionDocumentation{ + .description = "Returns the sine of the argument.", + .syntax = "sin(x)", + .arguments = {{"x", "The number whose sine will be returned. (U)Int*, Float* or Decimal*."}}, + .returned_value = "The sine of x.", + .examples = {{.name = "simple", .query = "SELECT sin(1.23)", .result = "0.9424888019316975"}}, + .categories{"Mathematical", "Trigonometric"}}, + FunctionFactory::CaseInsensitive); } } diff --git a/src/Functions/visitParamExtractBool.cpp b/src/Functions/visitParamExtractBool.cpp index 31763fe54ce..2c413ec13bb 100644 --- a/src/Functions/visitParamExtractBool.cpp +++ b/src/Functions/visitParamExtractBool.cpp @@ -21,7 +21,35 @@ using FunctionSimpleJSONExtractBool = FunctionsStringSearch(); + factory.registerFunction(FunctionDocumentation{ + .description = "Parses a true/false value from the value of the field named field_name. The result is UInt8.", + .syntax = "simpleJSONExtractBool(json, field_name)", + .arguments + = {{"json", "The JSON in which the field is searched for. String."}, + {"field_name", "The name of the field to search for. String literal."}}, + .returned_value + = R"(It returns 1 if the value of the field is true, 0 otherwise. This means this function will return 0 including (and not only) in the following cases: + - If the field doesn't exists. + - If the field contains true as a string, e.g.: {"field":"true"}. + - If the field contains 1 as a numerical value.)", + .examples + = {{.name = "simple", + .query = R"(CREATE TABLE jsons +( + json String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":false,"bar":true}'); +INSERT INTO jsons VALUES ('{"foo":"true","qux":1}'); + +SELECT simpleJSONExtractBool(json, 'bar') FROM jsons ORDER BY json; +SELECT simpleJSONExtractBool(json, 'foo') FROM jsons ORDER BY json;)", + .result = R"(0 +1 +0 +0)"}}, + .categories{"JSON"}}); factory.registerAlias("visitParamExtractBool", "simpleJSONExtractBool"); } diff --git a/src/Functions/visitParamExtractFloat.cpp b/src/Functions/visitParamExtractFloat.cpp index 6f6d5274050..fc839142cc7 100644 --- a/src/Functions/visitParamExtractFloat.cpp +++ b/src/Functions/visitParamExtractFloat.cpp @@ -11,7 +11,36 @@ using FunctionSimpleJSONExtractFloat = FunctionsStringSearch(); + factory.registerFunction(FunctionDocumentation{ + .description + = "Parses Float64 from the value of the field named field_name. If this is a string field, it tries to parse a number from the " + "beginning of the string. If the field does not exist, or it exists but does not contain a number, it returns 0.", + .syntax = "simpleJSONExtractFloat(json, field_name)", + .arguments + = {{"json", "The JSON in which the field is searched for. String."}, + {"field_name", "The name of the field to search for. 
String literal."}}, + .returned_value = "It returns the number parsed from the field if the field exists and contains a number, 0 otherwise.", + .examples + = {{.name = "simple", + .query = R"(CREATE TABLE jsons +( + json String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"-4e3"}'); +INSERT INTO jsons VALUES ('{"foo":-3.4}'); +INSERT INTO jsons VALUES ('{"foo":5}'); +INSERT INTO jsons VALUES ('{"foo":"not1number"}'); +INSERT INTO jsons VALUES ('{"baz":2}'); + +SELECT simpleJSONExtractFloat(json, 'foo') FROM jsons ORDER BY json;)", + .result = R"(0 +-4000 +0 +-3.4 +5)"}}, + .categories{"JSON"}}); factory.registerAlias("visitParamExtractFloat", "simpleJSONExtractFloat"); } diff --git a/src/Functions/visitParamExtractInt.cpp b/src/Functions/visitParamExtractInt.cpp index e020c43e8b4..4588fc55c52 100644 --- a/src/Functions/visitParamExtractInt.cpp +++ b/src/Functions/visitParamExtractInt.cpp @@ -11,7 +11,36 @@ using FunctionSimpleJSONExtractInt = FunctionsStringSearch(); + factory.registerFunction(FunctionDocumentation{ + .description + = "Parses Int64 from the value of the field named field_name. If this is a string field, it tries to parse a number from the " + "beginning of the string. If the field does not exist, or it exists but does not contain a number, it returns 0.", + .syntax = "simpleJSONExtractInt(json, field_name)", + .arguments + = {{"json", "The JSON in which the field is searched for. String."}, + {"field_name", "The name of the field to search for. String literal."}}, + .returned_value = "It returns the number parsed from the field if the field exists and contains a number, 0 otherwise.", + .examples + = {{.name = "simple", + .query = R"(CREATE TABLE jsons +( + json String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"-4e3"}'); +INSERT INTO jsons VALUES ('{"foo":-3.4}'); +INSERT INTO jsons VALUES ('{"foo":5}'); +INSERT INTO jsons VALUES ('{"foo":"not1number"}'); +INSERT INTO jsons VALUES ('{"baz":2}'); + +SELECT simpleJSONExtractInt(json, 'foo') FROM jsons ORDER BY json;)", + .result = R"(0 +-4 +0 +-3 +5)"}}, + .categories{"JSON"}}); factory.registerAlias("visitParamExtractInt", "simpleJSONExtractInt"); } diff --git a/src/Functions/visitParamExtractRaw.cpp b/src/Functions/visitParamExtractRaw.cpp index 74a83170545..296429423fe 100644 --- a/src/Functions/visitParamExtractRaw.cpp +++ b/src/Functions/visitParamExtractRaw.cpp @@ -61,7 +61,35 @@ using FunctionSimpleJSONExtractRaw = FunctionsStringSearchToString(); + factory.registerFunction(FunctionDocumentation{ + .description = "Returns the value of the field named field_name as a String, including separators.", + .syntax = "simpleJSONExtractRaw(json, field_name)", + .arguments + = {{"json", "The JSON in which the field is searched for. String."}, + {"field_name", "The name of the field to search for. 
String literal."}}, + .returned_value + = "It returns the value of the field as a String including separators if the field exists, or an emtpy String otherwise.", + .examples + = {{.name = "simple", + .query = R"(CREATE TABLE jsons +( + json String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"-4e3"}'); +INSERT INTO jsons VALUES ('{"foo":-3.4}'); +INSERT INTO jsons VALUES ('{"foo":5}'); +INSERT INTO jsons VALUES ('{"foo":{"def":[1,2,3]}}'); +INSERT INTO jsons VALUES ('{"baz":2}'); + +SELECT simpleJSONExtractRaw(json, 'foo') FROM jsons ORDER BY json;)", + .result = R"( +"-4e3" +-3.4 +5 +{"def":[1,2,3]})"}}, + .categories{"JSON"}}); factory.registerAlias("visitParamExtractRaw", "simpleJSONExtractRaw"); } diff --git a/src/Functions/visitParamExtractString.cpp b/src/Functions/visitParamExtractString.cpp index 50d5f345189..8dae10638f8 100644 --- a/src/Functions/visitParamExtractString.cpp +++ b/src/Functions/visitParamExtractString.cpp @@ -22,7 +22,35 @@ using FunctionSimpleJSONExtractString = FunctionsStringSearchToString(); + factory.registerFunction(FunctionDocumentation{ + .description = R"(Parses String in double quotes from the value of the field named field_name. + + There is currently no support for code points in the format \uXXXX\uYYYY that are not from the basic multilingual plane (they are converted to CESU-8 instead of UTF-8).)", + .syntax = "simpleJSONExtractString(json, field_name)", + .arguments + = {{"json", "The JSON in which the field is searched for. String."}, + {"field_name", "The name of the field to search for. String literal."}}, + .returned_value = "It returns the value of a field as a String, including separators. The value is unescaped. It returns an empty " + "String: if the field doesn't contain a double quoted string, if unescaping fails or if the field doesn't exist.", + .examples + = {{.name = "simple", + .query = R"(CREATE TABLE jsons +( + json String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"\\n\\u0000"}'); +INSERT INTO jsons VALUES ('{"foo":"\\u263"}'); +INSERT INTO jsons VALUES ('{"foo":"\\u263a"}'); +INSERT INTO jsons VALUES ('{"foo":"hello}'); + +SELECT simpleJSONExtractString(json, 'foo') FROM jsons ORDER BY json;)", + .result = R"(\n\0 + +☺ +)"}}, + .categories{"JSON"}}); factory.registerAlias("visitParamExtractString", "simpleJSONExtractString"); } diff --git a/src/Functions/visitParamExtractUInt.cpp b/src/Functions/visitParamExtractUInt.cpp index fb58e417f34..777df9fdd24 100644 --- a/src/Functions/visitParamExtractUInt.cpp +++ b/src/Functions/visitParamExtractUInt.cpp @@ -12,7 +12,36 @@ using FunctionSimpleJSONExtractUInt = FunctionsStringSearch(); + factory.registerFunction(FunctionDocumentation{ + .description + = "Parses UInt64 from the value of the field named field_name. If this is a string field, it tries to parse a number from the " + "beginning of the string. If the field does not exist, or it exists but does not contain a number, it returns 0.", + .syntax = "simpleJSONExtractUInt(json, field_name)", + .arguments + = {{"json", "The JSON in which the field is searched for. String."}, + {"field_name", "The name of the field to search for. 
String literal."}}, + .returned_value = "It returns the number parsed from the field if the field exists and contains a number, 0 otherwise.", + .examples + = {{.name = "simple", + .query = R"(CREATE TABLE jsons +( + json String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"4e3"}'); +INSERT INTO jsons VALUES ('{"foo":3.4}'); +INSERT INTO jsons VALUES ('{"foo":5}'); +INSERT INTO jsons VALUES ('{"foo":"not1number"}'); +INSERT INTO jsons VALUES ('{"baz":2}'); + +SELECT simpleJSONExtractUInt(json, 'foo') FROM jsons ORDER BY json;)", + .result = R"(0 +4 +0 +3 +5)"}}, + .categories{"JSON"}}); factory.registerAlias("visitParamExtractUInt", "simpleJSONExtractUInt"); } diff --git a/src/Functions/visitParamHas.cpp b/src/Functions/visitParamHas.cpp index 1ed1f1d16e7..09fec782980 100644 --- a/src/Functions/visitParamHas.cpp +++ b/src/Functions/visitParamHas.cpp @@ -21,7 +21,28 @@ using FunctionSimpleJSONHas = FunctionsStringSearch(); + factory.registerFunction(FunctionDocumentation{ + .description = "Checks whether there is a field named field_name. The result is UInt8.", + .syntax = "simpleJSONHas(json, field_name)", + .arguments + = {{"json", "The JSON in which the field is searched for. String."}, + {"field_name", "The name of the field to search for. String literal."}}, + .returned_value = "It returns 1 if the field exists, 0 otherwise.", + .examples + = {{.name = "simple", + .query = R"(CREATE TABLE jsons +( + json String +) +ENGINE = Memory; + +INSERT INTO jsons VALUES ('{"foo":"true","qux":1}'); + +SELECT simpleJSONHas(json, 'foo') FROM jsons; +SELECT simpleJSONHas(json, 'bar') FROM jsons;)", + .result = R"(1 +0)"}}, + .categories{"JSON"}}); factory.registerAlias("visitParamHas", "simpleJSONHas"); } diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index 379eea4dbbb..0a11e8b5034 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -643,14 +643,6 @@ shardNum showCertificate sigmoid sign -simpleJSONExtractBool -simpleJSONExtractFloat -simpleJSONExtractInt -simpleJSONExtractRaw -simpleJSONExtractString -simpleJSONExtractUInt -simpleJSONHas -sin sinh sipHash128 sipHash128Keyed From 2dc1721262c9f483917750aaa6139ff7409e02dc Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 6 Mar 2024 11:53:00 +0000 Subject: [PATCH 0110/1081] Refactorings for consistency --- src/Functions/array/arrayDistance.cpp | 54 +++++++-------- src/Functions/array/arrayDotProduct.cpp | 91 +++++++++++++------------ 2 files changed, 73 insertions(+), 72 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index 71564f6fa93..6b72c99d829 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -379,17 +379,17 @@ public: } -#define SUPPORTED_TYPES(action) \ - action(UInt8) \ - action(UInt16) \ - action(UInt32) \ - action(UInt64) \ - action(Int8) \ - action(Int16) \ - action(Int32) \ - action(Int64) \ - action(Float32) \ - action(Float64) +#define SUPPORTED_TYPES(ACTION) \ + ACTION(UInt8) \ + ACTION(UInt16) \ + ACTION(UInt32) \ + ACTION(UInt64) \ + ACTION(Int8) \ + ACTION(Int16) \ + ACTION(Int32) \ + ACTION(Int64) \ + ACTION(Float32) \ + ACTION(Float64) private: @@ -398,12 +398,11 @@ private: { DataTypePtr type_x = typeid_cast(arguments[0].type.get())->getNestedType(); - /// 
Dynamic disaptch based on the 1st argument type switch (type_x->getTypeId()) { #define ON_TYPE(type) \ case TypeIndex::type: \ - return executeWithFirstType(arguments, input_rows_count); \ + return executeWithResultTypeAndLeftType(arguments, input_rows_count); \ break; SUPPORTED_TYPES(ON_TYPE) @@ -419,17 +418,16 @@ private: } } - template - ColumnPtr executeWithFirstType(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const + template + ColumnPtr executeWithResultTypeAndLeftType(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const { DataTypePtr type_y = typeid_cast(arguments[1].type.get())->getNestedType(); - /// Dynamic disaptch based on the 2nd argument type switch (type_y->getTypeId()) { #define ON_TYPE(type) \ case TypeIndex::type: \ - return executeWithTypes(arguments[0].column, arguments[1].column, input_rows_count, arguments); \ + return executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column, input_rows_count, arguments); \ break; SUPPORTED_TYPES(ON_TYPE) @@ -445,16 +443,16 @@ private: } } - template - ColumnPtr executeWithTypes(ColumnPtr col_x, ColumnPtr col_y, size_t input_rows_count, const ColumnsWithTypeAndName & arguments) const + template + ColumnPtr executeWithResultTypeAndLeftTypeAndRightType(ColumnPtr col_x, ColumnPtr col_y, size_t input_rows_count, const ColumnsWithTypeAndName & arguments) const { if (typeid_cast(col_x.get())) { - return executeWithTypesFirstArgConst(col_x, col_y, input_rows_count, arguments); + return executeWithLeftArgConst(col_x, col_y, input_rows_count, arguments); } else if (typeid_cast(col_y.get())) { - return executeWithTypesFirstArgConst(col_y, col_x, input_rows_count, arguments); + return executeWithLeftArgConst(col_y, col_x, input_rows_count, arguments); } col_x = col_x->convertToFullColumnIfConst(); @@ -463,8 +461,8 @@ private: const auto & array_x = *assert_cast(col_x.get()); const auto & array_y = *assert_cast(col_y.get()); - const auto & data_x = typeid_cast &>(array_x.getData()).getData(); - const auto & data_y = typeid_cast &>(array_y.getData()).getData(); + const auto & data_x = typeid_cast &>(array_x.getData()).getData(); + const auto & data_y = typeid_cast &>(array_y.getData()).getData(); const auto & offsets_x = array_x.getOffsets(); const auto & offsets_y = array_y.getOffsets(); @@ -521,8 +519,8 @@ private: } /// Special case when the 1st parameter is Const - template - ColumnPtr executeWithTypesFirstArgConst(ColumnPtr col_x, ColumnPtr col_y, size_t input_rows_count, const ColumnsWithTypeAndName & arguments) const + template + ColumnPtr executeWithLeftArgConst(ColumnPtr col_x, ColumnPtr col_y, size_t input_rows_count, const ColumnsWithTypeAndName & arguments) const { col_x = assert_cast(col_x.get())->getDataColumnPtr(); col_y = col_y->convertToFullColumnIfConst(); @@ -530,8 +528,8 @@ private: const auto & array_x = *assert_cast(col_x.get()); const auto & array_y = *assert_cast(col_y.get()); - const auto & data_x = typeid_cast &>(array_x.getData()).getData(); - const auto & data_y = typeid_cast &>(array_y.getData()).getData(); + const auto & data_x = typeid_cast &>(array_x.getData()).getData(); + const auto & data_y = typeid_cast &>(array_y.getData()).getData(); const auto & offsets_x = array_x.getOffsets(); const auto & offsets_y = array_y.getOffsets(); @@ -574,7 +572,7 @@ private: /// - the two most common metrics L2 and cosine distance, /// - the most powerful SIMD instruction set (AVX-512F). 
#if USE_MULTITARGET_CODE - if constexpr (std::is_same_v && std::is_same_v) /// ResultType is Float32 or Float64 + if constexpr (std::is_same_v && std::is_same_v) /// ResultType is Float32 or Float64 { if constexpr (std::is_same_v || std::is_same_v) diff --git a/src/Functions/array/arrayDotProduct.cpp b/src/Functions/array/arrayDotProduct.cpp index 6c615a058c3..548c79c567f 100644 --- a/src/Functions/array/arrayDotProduct.cpp +++ b/src/Functions/array/arrayDotProduct.cpp @@ -163,26 +163,29 @@ public: return Kernel::getReturnType(nested_types[0], nested_types[1]); } +#define SUPPORTED_TYPES(ACTION) \ + ACTION(UInt8) \ + ACTION(UInt16) \ + ACTION(UInt32) \ + ACTION(UInt64) \ + ACTION(Int8) \ + ACTION(Int16) \ + ACTION(Int32) \ + ACTION(Int64) \ + ACTION(Float32) \ + ACTION(Float64) + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /* input_rows_count */) const override { switch (result_type->getTypeId()) { - #define SUPPORTED_TYPE(type) \ + #define ON_TYPE(type) \ case TypeIndex::type: \ return executeWithResultType(arguments); \ break; - SUPPORTED_TYPE(UInt8) - SUPPORTED_TYPE(UInt16) - SUPPORTED_TYPE(UInt32) - SUPPORTED_TYPE(UInt64) - SUPPORTED_TYPE(Int8) - SUPPORTED_TYPE(Int16) - SUPPORTED_TYPE(Int32) - SUPPORTED_TYPE(Int64) - SUPPORTED_TYPE(Float32) - SUPPORTED_TYPE(Float64) - #undef SUPPORTED_TYPE + SUPPORTED_TYPES(ON_TYPE) + #undef ON_TYPE default: throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected result type {}", result_type->getName()); @@ -194,16 +197,16 @@ private: ColumnPtr executeWithResultType(const ColumnsWithTypeAndName & arguments) const { ColumnPtr res; - if (!((res = executeWithResultTypeAndLeft(arguments)) - || (res = executeWithResultTypeAndLeft(arguments)) - || (res = executeWithResultTypeAndLeft(arguments)) - || (res = executeWithResultTypeAndLeft(arguments)) - || (res = executeWithResultTypeAndLeft(arguments)) - || (res = executeWithResultTypeAndLeft(arguments)) - || (res = executeWithResultTypeAndLeft(arguments)) - || (res = executeWithResultTypeAndLeft(arguments)) - || (res = executeWithResultTypeAndLeft(arguments)) - || (res = executeWithResultTypeAndLeft(arguments)))) + if (!((res = executeWithResultTypeAndLeftType(arguments)) + || (res = executeWithResultTypeAndLeftType(arguments)) + || (res = executeWithResultTypeAndLeftType(arguments)) + || (res = executeWithResultTypeAndLeftType(arguments)) + || (res = executeWithResultTypeAndLeftType(arguments)) + || (res = executeWithResultTypeAndLeftType(arguments)) + || (res = executeWithResultTypeAndLeftType(arguments)) + || (res = executeWithResultTypeAndLeftType(arguments)) + || (res = executeWithResultTypeAndLeftType(arguments)) + || (res = executeWithResultTypeAndLeftType(arguments)))) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", arguments[0].column->getName(), getName()); @@ -211,43 +214,43 @@ private: } template - ColumnPtr executeWithResultTypeAndLeft(const ColumnsWithTypeAndName & arguments) const + ColumnPtr executeWithResultTypeAndLeftType(const ColumnsWithTypeAndName & arguments) const { ColumnPtr res; - if ( (res = executeWithResultTypeAndLeftAndRight(arguments)) - || (res = executeWithResultTypeAndLeftAndRight(arguments)) - || (res = executeWithResultTypeAndLeftAndRight(arguments)) - || (res = executeWithResultTypeAndLeftAndRight(arguments)) - || (res = executeWithResultTypeAndLeftAndRight(arguments)) - || (res = executeWithResultTypeAndLeftAndRight(arguments)) - || (res = 
executeWithResultTypeAndLeftAndRight(arguments)) - || (res = executeWithResultTypeAndLeftAndRight(arguments)) - || (res = executeWithResultTypeAndLeftAndRight(arguments)) - || (res = executeWithResultTypeAndLeftAndRight(arguments))) + if ( (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) + || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) + || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) + || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) + || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) + || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) + || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) + || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) + || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) + || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column))) return res; return nullptr; } template - ColumnPtr executeWithResultTypeAndLeftAndRight(const ColumnsWithTypeAndName & arguments) const + ColumnPtr executeWithResultTypeAndLeftTypeAndRightType(ColumnPtr col_x, ColumnPtr col_y) const { - ColumnPtr col_left = arguments[0].column->convertToFullColumnIfConst(); - ColumnPtr col_right = arguments[1].column->convertToFullColumnIfConst(); - if (!col_left || !col_right) + col_x = col_x->convertToFullColumnIfConst(); + col_y = col_y->convertToFullColumnIfConst(); + if (!col_x || !col_y) return nullptr; - const ColumnArray * col_arr_left = checkAndGetColumn(col_left.get()); - const ColumnArray * cokl_arr_right = checkAndGetColumn(col_right.get()); - if (!col_arr_left || !cokl_arr_right) + const ColumnArray * array_x = checkAndGetColumn(col_x.get()); + const ColumnArray * array_y = checkAndGetColumn(col_y.get()); + if (!array_x || !array_y) return nullptr; - const ColumnVector * col_arr_nested_left = checkAndGetColumn>(col_arr_left->getData()); - const ColumnVector * col_arr_nested_right = checkAndGetColumn>(cokl_arr_right->getData()); + const ColumnVector * col_arr_nested_left = checkAndGetColumn>(array_x->getData()); + const ColumnVector * col_arr_nested_right = checkAndGetColumn>(array_y->getData()); if (!col_arr_nested_left || !col_arr_nested_right) return nullptr; - if (!col_arr_left->hasEqualOffsets(*cokl_arr_right)) + if (!array_x->hasEqualOffsets(*array_y)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Array arguments for function {} must have equal sizes", getName()); auto col_res = ColumnVector::create(); @@ -255,7 +258,7 @@ private: vector( col_arr_nested_left->getData(), col_arr_nested_right->getData(), - col_arr_left->getOffsets(), + array_x->getOffsets(), col_res->getData()); return col_res; From 076482e8bd503ca352695173d87e9b48228389f0 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 6 Mar 2024 14:04:09 +0100 Subject: [PATCH 0111/1081] Remove whitespaces --- src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp index 42c6e63da01..e0d0fda81cb 100644 --- a/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp +++ 
b/src/Interpreters/OptimizeShardingKeyRewriteInVisitor.cpp @@ -132,7 +132,7 @@ class OptimizeShardingKeyRewriteIn : public InDepthQueryTreeVisitorWithContext; - + OptimizeShardingKeyRewriteIn(OptimizeShardingKeyRewriteInVisitor::Data data_, ContextPtr context) : Base(std::move(context)) , data(std::move(data_)) From 7065e650e1d007be4659ddb1f070b48e19cdef55 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 6 Mar 2024 14:34:03 +0100 Subject: [PATCH 0112/1081] Code cleanup --- src/DataTypes/ObjectUtils.cpp | 24 ++------ src/DataTypes/ObjectUtils.h | 2 - .../ClusterProxy/SelectStreamFactory.cpp | 7 +-- .../ClusterProxy/SelectStreamFactory.h | 2 +- src/Processors/QueryPlan/ReadFromRemote.cpp | 61 +++---------------- src/Storages/StorageDistributed.cpp | 1 - tests/analyzer_integration_broken_tests.txt | 1 - .../test_distributed_type_object/test.py | 3 +- 8 files changed, 18 insertions(+), 83 deletions(-) diff --git a/src/DataTypes/ObjectUtils.cpp b/src/DataTypes/ObjectUtils.cpp index 47d8c5c9113..ccfa0a28f13 100644 --- a/src/DataTypes/ObjectUtils.cpp +++ b/src/DataTypes/ObjectUtils.cpp @@ -1,4 +1,8 @@ #include +#include +#include +#include +#include #include #include #include @@ -21,16 +25,6 @@ #include #include #include -#include "Analyzer/ConstantNode.h" -#include "Analyzer/FunctionNode.h" -#include "Analyzer/IQueryTreeNode.h" -#include "Analyzer/Identifier.h" -#include "Analyzer/IdentifierNode.h" -#include "Analyzer/QueryNode.h" -#include "Analyzer/Utils.h" -#include -#include -#include "Common/logger_useful.h" namespace DB @@ -991,22 +985,12 @@ MissingObjectList replaceMissedSubcolumnsByConstants( { auto constant = std::make_shared(type->getDefault(), type); constant->setAlias(table_expression->getAlias() + "." + name); - // auto materialize = std::make_shared("materialize"); - - // auto function = FunctionFactory::instance().get("materialize", context); - // materialize->getArguments().getNodes() = { constant }; - // materialize->resolveAsFunction(function->build(materialize->getArgumentColumns())); - // materialize->setAlias(name); column_name_to_node[name] = buildCastFunction(constant, type, context); missed_list.push_back({ constant->getValueStringRepresentation() + "_" + constant->getResultType()->getName(), table_expression->getAlias() + "." 
+ name }); - LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "{} -> {}", missed_list.back().first, missed_list.back().second); - LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "Name {} Expression\n{}", name, column_name_to_node[name]->dumpTree()); } - LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "Table expression\n{} ", table_expression->dumpTree()); replaceColumns(query, table_expression, column_name_to_node); - LOG_DEBUG(&Poco::Logger::get("replaceMissedSubcolumnsByConstants"), "Result:\n{} ", query->dumpTree()); return missed_list; } diff --git a/src/DataTypes/ObjectUtils.h b/src/DataTypes/ObjectUtils.h index 013e525832e..6ef19baf5ae 100644 --- a/src/DataTypes/ObjectUtils.h +++ b/src/DataTypes/ObjectUtils.h @@ -3,8 +3,6 @@ #include #include #include -#include "Analyzer/IQueryTreeNode.h" -#include "Interpreters/Context_fwd.h" #include #include #include diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp index 5bcd1ce68cb..4fccd83c8c0 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.cpp @@ -5,10 +5,9 @@ #include #include #include -#include "Analyzer/IQueryTreeNode.h" -#include "Interpreters/InterpreterSelectQueryAnalyzer.h" -#include "Interpreters/SelectQueryOptions.h" -#include "Planner/Utils.h" +#include +#include +#include #include #include #include diff --git a/src/Interpreters/ClusterProxy/SelectStreamFactory.h b/src/Interpreters/ClusterProxy/SelectStreamFactory.h index bee7edb3c19..61694830b3d 100644 --- a/src/Interpreters/ClusterProxy/SelectStreamFactory.h +++ b/src/Interpreters/ClusterProxy/SelectStreamFactory.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -8,7 +9,6 @@ #include #include #include -#include "Analyzer/IQueryTreeNode.h" namespace DB { diff --git a/src/Processors/QueryPlan/ReadFromRemote.cpp b/src/Processors/QueryPlan/ReadFromRemote.cpp index ac507c6d555..72848a37f6e 100644 --- a/src/Processors/QueryPlan/ReadFromRemote.cpp +++ b/src/Processors/QueryPlan/ReadFromRemote.cpp @@ -1,4 +1,3 @@ -#include #include #include @@ -15,7 +14,6 @@ #include #include #include -#include "DataTypes/ObjectUtils.h" #include #include #include @@ -33,54 +31,14 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -static void addRenamingActions(Pipe & pipe, const MissingObjectList & missed_list, const Block & output_header) -{ - if (missed_list.empty()) - return; - - const auto & output_columns = output_header.getColumnsWithTypeAndName(); - std::vector indexes; - for (size_t i = 0; i < output_columns.size(); ++i) - { - bool found = false; - for (auto const & elem : missed_list) - { - if (output_columns[i].name.contains(elem.second)) - { - found = true; - break; - } - } - if (found) - indexes.push_back(i); - } - - auto dag = std::make_shared(pipe.getHeader().getColumnsWithTypeAndName()); - - for (size_t index : indexes) - { - dag->addOrReplaceInOutputs(dag->addAlias(*dag->getOutputs()[index], output_header.getByPosition(index).name)); - } - - // dag->addAliases(rename_to_apply); - - auto convert_actions = std::make_shared(dag); - pipe.addSimpleTransform([&](const Block & cur_header, Pipe::StreamType) -> ProcessorPtr - { - return std::make_shared(cur_header, convert_actions); - }); - - LOG_DEBUG(&Poco::Logger::get("addRenamingActions"), "EXPECTED:\n{}", output_header.dumpStructure()); - - LOG_DEBUG(&Poco::Logger::get("addRenamingActions"), "{}", 
pipe.getHeader().dumpStructure()); -} - -static void addConvertingActions(Pipe & pipe, const Block & header) +static void addConvertingActions(Pipe & pipe, const Block & header, bool use_positions_to_match = false) { if (blocksHaveEqualStructure(pipe.getHeader(), header)) return; - auto get_converting_dag = [](const Block & block_, const Block & header_) + auto match_mode = use_positions_to_match ? ActionsDAG::MatchColumnsMode::Position : ActionsDAG::MatchColumnsMode::Name; + + auto get_converting_dag = [mode = match_mode](const Block & block_, const Block & header_) { /// Convert header structure to expected. /// Also we ignore constants from result and replace it with constants from header. @@ -88,7 +46,7 @@ static void addConvertingActions(Pipe & pipe, const Block & header) return ActionsDAG::makeConvertingActions( block_.getColumnsWithTypeAndName(), header_.getColumnsWithTypeAndName(), - ActionsDAG::MatchColumnsMode::Name, + mode, true); }; @@ -260,8 +218,7 @@ void ReadFromRemote::addLazyPipe(Pipes & pipes, const ClusterProxy::SelectStream }; pipes.emplace_back(createDelayedPipe(shard.header, lazily_create_stream, add_totals, add_extremes)); - addRenamingActions(pipes.back(), shard.missing_object_list, output_stream->header); - addConvertingActions(pipes.back(), output_stream->header); + addConvertingActions(pipes.back(), output_stream->header, !shard.missing_object_list.empty()); } void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFactory::Shard & shard) @@ -342,8 +299,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact pipes.emplace_back( createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending)); - addRenamingActions(pipes.back(), shard.missing_object_list, output_stream->header); - addConvertingActions(pipes.back(), output_stream->header); + addConvertingActions(pipes.back(), output_stream->header, !shard.missing_object_list.empty()); } } else @@ -372,8 +328,7 @@ void ReadFromRemote::addPipe(Pipes & pipes, const ClusterProxy::SelectStreamFact pipes.emplace_back( createRemoteSourcePipe(remote_query_executor, add_agg_info, add_totals, add_extremes, async_read, async_query_sending)); - addRenamingActions(pipes.back(), shard.missing_object_list, output_stream->header); - addConvertingActions(pipes.back(), output_stream->header); + addConvertingActions(pipes.back(), output_stream->header, !shard.missing_object_list.empty()); } } diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 34ab21a4751..726f1788115 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -30,7 +30,6 @@ #include #include #include -#include "Analyzer/IQueryTreeNode.h" #include #include diff --git a/tests/analyzer_integration_broken_tests.txt b/tests/analyzer_integration_broken_tests.txt index 796ca6bca22..a7954f91efa 100644 --- a/tests/analyzer_integration_broken_tests.txt +++ b/tests/analyzer_integration_broken_tests.txt @@ -1,6 +1,5 @@ test_build_sets_from_multiple_threads/test.py::test_set test_concurrent_backups_s3/test.py::test_concurrent_backups -test_distributed_type_object/test.py::test_distributed_type_object test_merge_table_over_distributed/test.py::test_global_in test_merge_table_over_distributed/test.py::test_select_table_name_from_merge_over_distributed test_passing_max_partitions_to_read_remotely/test.py::test_default_database_on_cluster diff --git a/tests/integration/test_distributed_type_object/test.py 
b/tests/integration/test_distributed_type_object/test.py index 7e6c000cb8e..360087c9dda 100644 --- a/tests/integration/test_distributed_type_object/test.py +++ b/tests/integration/test_distributed_type_object/test.py @@ -85,10 +85,11 @@ def test_distributed_type_object(started_cluster): 3\t\t\t\tfoo""" ) + # The following query is not supported by analyzer now assert ( TSV( node1.query( - "SELECT id, data.k1, data.k2.k3, data.k2.k4, data.k5 FROM dist_table ORDER BY id" + "SELECT id, data.k1, data.k2.k3, data.k2.k4, data.k5 FROM dist_table ORDER BY id SETTINGS allow_experimental_analyzer = 0" ) ) == expected From c3909743ed95adf5efd16e69f353aab8af73978b Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 6 Mar 2024 12:36:00 +0000 Subject: [PATCH 0113/1081] Remove repeated unnecessary unpacking of const columns --- src/Functions/array/arrayDotProduct.cpp | 88 +++++++++++++------------ 1 file changed, 46 insertions(+), 42 deletions(-) diff --git a/src/Functions/array/arrayDotProduct.cpp b/src/Functions/array/arrayDotProduct.cpp index 548c79c567f..c27170cd35b 100644 --- a/src/Functions/array/arrayDotProduct.cpp +++ b/src/Functions/array/arrayDotProduct.cpp @@ -19,7 +19,6 @@ namespace DB namespace ErrorCodes { extern const int BAD_ARGUMENTS; - extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int LOGICAL_ERROR; } @@ -196,40 +195,51 @@ private: template ColumnPtr executeWithResultType(const ColumnsWithTypeAndName & arguments) const { - ColumnPtr res; - if (!((res = executeWithResultTypeAndLeftType(arguments)) - || (res = executeWithResultTypeAndLeftType(arguments)) - || (res = executeWithResultTypeAndLeftType(arguments)) - || (res = executeWithResultTypeAndLeftType(arguments)) - || (res = executeWithResultTypeAndLeftType(arguments)) - || (res = executeWithResultTypeAndLeftType(arguments)) - || (res = executeWithResultTypeAndLeftType(arguments)) - || (res = executeWithResultTypeAndLeftType(arguments)) - || (res = executeWithResultTypeAndLeftType(arguments)) - || (res = executeWithResultTypeAndLeftType(arguments)))) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {} of first argument of function {}", arguments[0].column->getName(), getName()); + DataTypePtr type_x = typeid_cast(arguments[0].type.get())->getNestedType(); - return res; + switch (type_x->getTypeId()) + { +#define ON_TYPE(type) \ + case TypeIndex::type: \ + return executeWithResultTypeAndLeftType(arguments); \ + break; + + SUPPORTED_TYPES(ON_TYPE) +#undef ON_TYPE + + default: + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Arguments of function {} has nested type {}. 
" + "Support: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", + getName(), + type_x->getName()); + } } template ColumnPtr executeWithResultTypeAndLeftType(const ColumnsWithTypeAndName & arguments) const { - ColumnPtr res; - if ( (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) - || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) - || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) - || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) - || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) - || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) - || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) - || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) - || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column)) - || (res = executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column))) - return res; + DataTypePtr type_y = typeid_cast(arguments[1].type.get())->getNestedType(); - return nullptr; + switch (type_y->getTypeId()) + { + #define ON_TYPE(type) \ + case TypeIndex::type: \ + return executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column); \ + break; + + SUPPORTED_TYPES(ON_TYPE) + #undef ON_TYPE + + default: + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Arguments of function {} has nested type {}. " + "Support: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", + getName(), + type_y->getName()); + } } template @@ -237,28 +247,22 @@ private: { col_x = col_x->convertToFullColumnIfConst(); col_y = col_y->convertToFullColumnIfConst(); - if (!col_x || !col_y) - return nullptr; - const ColumnArray * array_x = checkAndGetColumn(col_x.get()); - const ColumnArray * array_y = checkAndGetColumn(col_y.get()); - if (!array_x || !array_y) - return nullptr; + const auto & array_x = *assert_cast(col_x.get()); + const auto & array_y = *assert_cast(col_y.get()); - const ColumnVector * col_arr_nested_left = checkAndGetColumn>(array_x->getData()); - const ColumnVector * col_arr_nested_right = checkAndGetColumn>(array_y->getData()); - if (!col_arr_nested_left || !col_arr_nested_right) - return nullptr; + const auto & data_x = typeid_cast &>(array_x.getData()).getData(); + const auto & data_y = typeid_cast &>(array_y.getData()).getData(); - if (!array_x->hasEqualOffsets(*array_y)) + if (!array_x.hasEqualOffsets(array_y)) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Array arguments for function {} must have equal sizes", getName()); auto col_res = ColumnVector::create(); vector( - col_arr_nested_left->getData(), - col_arr_nested_right->getData(), - array_x->getOffsets(), + data_x, + data_y, + array_x.getOffsets(), col_res->getData()); return col_res; From 5b94f9b4115e3b7e03118b4a4f4999139e58511e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Wed, 6 Mar 2024 15:31:19 +0100 Subject: [PATCH 0114/1081] Check children first --- src/Storages/VirtualColumnUtils.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 6d66453442e..e8441b96782 
100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -469,18 +469,18 @@ static bool canEvaluateSubtree(const ActionsDAG::Node * node, const Block & allo static bool isDeterministicInScopeOfQuery(const ActionsDAG::Node * node) { - if (node->type != ActionsDAG::ActionType::FUNCTION) - return true; - - if (!node->function_base->isDeterministicInScopeOfQuery()) - return false; - for (const auto * child : node->children) { if (!isDeterministicInScopeOfQuery(child)) return false; } + if (node->type != ActionsDAG::ActionType::FUNCTION) + return true; + + if (!node->function_base->isDeterministicInScopeOfQuery()) + return false; + return true; } From a69bcc29f5bb0bdaca1757673dac1574c97b1e2f Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 6 Mar 2024 14:53:11 +0000 Subject: [PATCH 0115/1081] Refactor a sanity check --- src/Functions/array/arrayDistance.cpp | 20 +++---------------- src/Functions/array/arrayDotProduct.cpp | 8 +++++--- .../queries/0_stateless/02708_dotProduct.sql | 2 +- 3 files changed, 9 insertions(+), 21 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index 6b72c99d829..c9e6e97749f 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -18,11 +18,11 @@ namespace DB { namespace ErrorCodes { + extern const int ARGUMENT_OUT_OF_BOUND; extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int LOGICAL_ERROR; extern const int SIZES_OF_ARRAYS_DONT_MATCH; - extern const int ARGUMENT_OUT_OF_BOUND; } struct L1Distance @@ -465,22 +465,9 @@ private: const auto & data_y = typeid_cast &>(array_y.getData()).getData(); const auto & offsets_x = array_x.getOffsets(); - const auto & offsets_y = array_y.getOffsets(); - /// Check that arrays in both columns are the sames size - for (size_t row = 0; row < offsets_x.size(); ++row) - { - if (offsets_x[row] != offsets_y[row]) [[unlikely]] - { - ColumnArray::Offset prev_offset = row > 0 ? 
offsets_x[row] : 0; - throw Exception( - ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, - "Arguments of function {} have different array sizes: {} and {}", - getName(), - offsets_x[row] - prev_offset, - offsets_y[row] - prev_offset); - } - } + if (!array_x.hasEqualOffsets(array_y)) + throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, "Array arguments for function {} must have equal sizes", getName()); const typename Kernel::ConstParams kernel_params = initConstParams(arguments); @@ -534,7 +521,6 @@ private: const auto & offsets_x = array_x.getOffsets(); const auto & offsets_y = array_y.getOffsets(); - /// Check that arrays in both columns are the sames size ColumnArray::Offset prev_offset = 0; for (size_t row : collections::range(0, offsets_y.size())) { diff --git a/src/Functions/array/arrayDotProduct.cpp b/src/Functions/array/arrayDotProduct.cpp index c27170cd35b..3abd1a6c6db 100644 --- a/src/Functions/array/arrayDotProduct.cpp +++ b/src/Functions/array/arrayDotProduct.cpp @@ -18,9 +18,9 @@ namespace DB namespace ErrorCodes { - extern const int BAD_ARGUMENTS; extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int LOGICAL_ERROR; + extern const int SIZES_OF_ARRAYS_DONT_MATCH; } @@ -254,15 +254,17 @@ private: const auto & data_x = typeid_cast &>(array_x.getData()).getData(); const auto & data_y = typeid_cast &>(array_y.getData()).getData(); + const auto & offsets_x = array_x.getOffsets(); + if (!array_x.hasEqualOffsets(array_y)) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Array arguments for function {} must have equal sizes", getName()); + throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, "Array arguments for function {} must have equal sizes", getName()); auto col_res = ColumnVector::create(); vector( data_x, data_y, - array_x.getOffsets(), + offsets_x, col_res->getData()); return col_res; diff --git a/tests/queries/0_stateless/02708_dotProduct.sql b/tests/queries/0_stateless/02708_dotProduct.sql index 6ad615664e8..2035e23cf1d 100644 --- a/tests/queries/0_stateless/02708_dotProduct.sql +++ b/tests/queries/0_stateless/02708_dotProduct.sql @@ -4,7 +4,7 @@ SELECT arrayDotProduct([1, 2]); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATC SELECT arrayDotProduct([1, 2], 'abc'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayDotProduct('abc', [1, 2]); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT arrayDotProduct([1, 2], ['abc', 'def']); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } -SELECT arrayDotProduct([1, 2], [3, 4, 5]); -- { serverError BAD_ARGUMENTS } +SELECT arrayDotProduct([1, 2], [3, 4, 5]); -- { serverError SIZES_OF_ARRAYS_DONT_MATCH } SELECT dotProduct([1, 2], (3, 4, 5)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } SELECT '-- Tests'; From 526f162082dfbb4ad2fb5d3d807dfd2ad9b54bdd Mon Sep 17 00:00:00 2001 From: avogar Date: Thu, 29 Feb 2024 18:20:47 +0000 Subject: [PATCH 0116/1081] Fix logical error on bad compatibility setting value type --- src/Core/Settings.cpp | 4 ++++ .../03003_compatibility_setting_bad_value.reference | 0 .../0_stateless/03003_compatibility_setting_bad_value.sql | 2 ++ 3 files changed, 6 insertions(+) create mode 100644 tests/queries/0_stateless/03003_compatibility_setting_bad_value.reference create mode 100644 tests/queries/0_stateless/03003_compatibility_setting_bad_value.sql diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index a38197b9eeb..fb456b46d89 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -114,7 +114,11 @@ std::vector Settings::getAllRegisteredNames() const void Settings::set(std::string_view name, 
const Field & value) { if (name == "compatibility") + { + if (value.getType() != Field::Types::Which::String) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Unexpected type of value for setting 'compatibility'. Expected String, got {}", value.getTypeName()); applyCompatibilitySetting(value.get()); + } /// If we change setting that was changed by compatibility setting before /// we should remove it from settings_changed_by_compatibility_setting, /// otherwise the next time we will change compatibility setting diff --git a/tests/queries/0_stateless/03003_compatibility_setting_bad_value.reference b/tests/queries/0_stateless/03003_compatibility_setting_bad_value.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/03003_compatibility_setting_bad_value.sql b/tests/queries/0_stateless/03003_compatibility_setting_bad_value.sql new file mode 100644 index 00000000000..9a6f4e7944a --- /dev/null +++ b/tests/queries/0_stateless/03003_compatibility_setting_bad_value.sql @@ -0,0 +1,2 @@ +select 42 settings compatibility=NULL; -- {clientError BAD_GET} + From bdb76d9dd4b42ab4f40db0d371165665171afb4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 6 Mar 2024 16:30:22 +0000 Subject: [PATCH 0117/1081] Fix aspell errors --- docs/en/sql-reference/functions/json-functions.md | 2 +- utils/check-style/aspell-ignore/en/aspell-dict.txt | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/en/sql-reference/functions/json-functions.md b/docs/en/sql-reference/functions/json-functions.md index 246cb8972fb..e920ab82988 100644 --- a/docs/en/sql-reference/functions/json-functions.md +++ b/docs/en/sql-reference/functions/json-functions.md @@ -264,7 +264,7 @@ simpleJSONExtractRaw(json, field_name) **Returned value** -It returns the value of the field as a [`String`](../../sql-reference/data-types/string.md#string), including separators if the field exists, or an emtpy `String` otherwise. +It returns the value of the field as a [`String`](../../sql-reference/data-types/string.md#string), including separators if the field exists, or an empty `String` otherwise. 
**Example** diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 3614bcb7452..917b2cdcc71 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -1,4 +1,4 @@ -personal_ws-1.1 en 2724 +personal_ws-1.1 en 2758 AArch ACLs ALTERs @@ -843,7 +843,6 @@ SendScalars ShareAlike SharedMergeTree Shortkeys -Shortkeys SimHash Simhash SimpleAggregateFunction @@ -1703,7 +1702,6 @@ hyperscan hypot hyvor iTerm -iTerm icosahedron icudata idempotency @@ -2327,6 +2325,14 @@ shortcircuit shortkeys shoutout simdjson +simpleJSON +simpleJSONExtractBool +simpleJSONExtractFloat +simpleJSONExtractInt +simpleJSONExtractRaw +simpleJSONExtractString +simpleJSONExtractUInt +simpleJSONHas simpleLinearRegression simpleaggregatefunction simplelinearregression From 77a980373a1dab7c49e5713ba7050d218c1250c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Wed, 6 Mar 2024 16:31:27 +0000 Subject: [PATCH 0118/1081] Fix typo in inline doc --- src/Functions/visitParamExtractRaw.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/visitParamExtractRaw.cpp b/src/Functions/visitParamExtractRaw.cpp index 296429423fe..3cdc5001e13 100644 --- a/src/Functions/visitParamExtractRaw.cpp +++ b/src/Functions/visitParamExtractRaw.cpp @@ -68,7 +68,7 @@ REGISTER_FUNCTION(VisitParamExtractRaw) = {{"json", "The JSON in which the field is searched for. String."}, {"field_name", "The name of the field to search for. String literal."}}, .returned_value - = "It returns the value of the field as a String including separators if the field exists, or an emtpy String otherwise.", + = "It returns the value of the field as a String including separators if the field exists, or an empty String otherwise.", .examples = {{.name = "simple", .query = R"(CREATE TABLE jsons From d9b5f9a086d8bc1af5387edee77d0e3fdbf4d9b0 Mon Sep 17 00:00:00 2001 From: Nikolay Monkov Date: Wed, 6 Mar 2024 21:37:56 +0500 Subject: [PATCH 0119/1081] version has been added to docker labels --- tests/ci/docker_server.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/ci/docker_server.py b/tests/ci/docker_server.py index 38d0ea6d86b..803dbfcd92a 100644 --- a/tests/ci/docker_server.py +++ b/tests/ci/docker_server.py @@ -216,11 +216,12 @@ def gen_tags(version: ClickHouseVersion, release_type: str) -> List[str]: return tags -def buildx_args(urls: Dict[str, str], arch: str, direct_urls: List[str]) -> List[str]: +def buildx_args(urls: Dict[str, str], arch: str, direct_urls: List[str], version: str) -> List[str]: args = [ f"--platform=linux/{arch}", f"--label=build-url={GITHUB_RUN_URL}", f"--label=com.clickhouse.build.githash={git.sha}", + f"--label=com.clickhouse.build.version={version}", ] if direct_urls: args.append(f"--build-arg=DIRECT_DOWNLOAD_URLS='{' '.join(direct_urls)}'") @@ -267,7 +268,7 @@ def build_and_push_image( urls = [url for url in direct_urls[arch] if ".deb" in url] else: urls = [url for url in direct_urls[arch] if ".tgz" in url] - cmd_args.extend(buildx_args(repo_urls, arch, direct_urls=urls)) + cmd_args.extend(buildx_args(repo_urls, arch, direct_urls=urls, version=version.describe)) if not push: cmd_args.append(f"--tag={image.repo}:{arch_tag}") cmd_args.extend( From 245ea0c186614a5a13a7e0bed79f94bc6ad46d87 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Wed, 6 Mar 2024 15:26:11 +0000 Subject: [PATCH 0120/1081] Implement const/non-const shortcut --- 
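Context for the shortcut: when exactly one argument is a ColumnConst (e.g. a literal reference vector matched against every row), unwrapping the constant once and reusing its single array avoids re-reading it per row, and because dotProduct(x, y) == dotProduct(y, x) a constant right argument can reuse the same left-const kernel with the operands swapped. A minimal sketch of the dispatch added below (simplified; template parameters and the AVX-512 path of the actual patch are omitted):

    // Route const arguments to the dedicated kernel before the generic path.
    if (typeid_cast<const ColumnConst *>(col_x.get()))
        return executeWithLeftArgConst(col_x, col_y);
    if (typeid_cast<const ColumnConst *>(col_y.get()))
        return executeWithLeftArgConst(col_y, col_x); // swap is safe: dot product is commutative
    // ... otherwise fall through to the non-const/non-const loop ...
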
src/Functions/array/arrayDistance.cpp | 3 +- src/Functions/array/arrayDotProduct.cpp | 110 ++++++++++++++---- tests/performance/dotProduct.xml | 1 + .../0_stateless/02708_dotProduct.reference | 7 ++ .../queries/0_stateless/02708_dotProduct.sql | 9 ++ 5 files changed, 106 insertions(+), 24 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index c9e6e97749f..03f0bc7b286 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -477,6 +477,7 @@ private: /// Do the actual computation ColumnArray::Offset prev = 0; size_t row = 0; + for (auto off : offsets_x) { /// Process chunks in vectorized manner @@ -522,7 +523,7 @@ private: const auto & offsets_y = array_y.getOffsets(); ColumnArray::Offset prev_offset = 0; - for (size_t row : collections::range(0, offsets_y.size())) + for (size_t row = 0; row < offsets_y.size(); ++row) { if (offsets_x[0] != offsets_y[row] - prev_offset) [[unlikely]] { diff --git a/src/Functions/array/arrayDotProduct.cpp b/src/Functions/array/arrayDotProduct.cpp index 3abd1a6c6db..f9a6687e028 100644 --- a/src/Functions/array/arrayDotProduct.cpp +++ b/src/Functions/array/arrayDotProduct.cpp @@ -245,6 +245,15 @@ private: template ColumnPtr executeWithResultTypeAndLeftTypeAndRightType(ColumnPtr col_x, ColumnPtr col_y) const { + if (typeid_cast(col_x.get())) + { + return executeWithLeftArgConst(col_x, col_y); + } + else if (typeid_cast(col_y.get())) + { + return executeWithLeftArgConst(col_y, col_x); + } + col_x = col_x->convertToFullColumnIfConst(); col_y = col_y->convertToFullColumnIfConst(); @@ -260,30 +269,83 @@ private: throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, "Array arguments for function {} must have equal sizes", getName()); auto col_res = ColumnVector::create(); + auto & result = col_res->getData(); - vector( - data_x, - data_y, - offsets_x, - col_res->getData()); - - return col_res; - } - - template - static void vector( - const PaddedPODArray & left, - const PaddedPODArray & right, - const ColumnArray::Offsets & offsets, - PaddedPODArray & result) - { - size_t size = offsets.size(); + size_t size = offsets_x.size(); result.resize(size); ColumnArray::Offset current_offset = 0; for (size_t row = 0; row < size; ++row) { - size_t array_size = offsets[row] - current_offset; + size_t array_size = offsets_x[row] - current_offset; + + size_t i = 0; + + /// Process chunks in vectorized manner + static constexpr size_t VEC_SIZE = 4; + typename Kernel::template State states[VEC_SIZE]; + for (; i + VEC_SIZE < array_size; i += VEC_SIZE) + { + for (size_t j = 0; j < VEC_SIZE; ++j) + Kernel::template accumulate(states[j], static_cast(data_x[current_offset + i + j]), static_cast(data_y[current_offset + i + j])); + } + + typename Kernel::template State state; + for (const auto & other_state : states) + Kernel::template combine(state, other_state); + + /// Process the tail + for (; i < array_size; ++i) + Kernel::template accumulate(state, static_cast(data_x[current_offset + i]), static_cast(data_y[current_offset + i])); + + result[row] = Kernel::template finalize(state); + + current_offset = offsets_x[row]; + } + + return col_res; + } + + template + ColumnPtr executeWithLeftArgConst(ColumnPtr col_x, ColumnPtr col_y) const + { + col_x = assert_cast(col_x.get())->getDataColumnPtr(); + col_y = col_y->convertToFullColumnIfConst(); + + const auto & array_x = *assert_cast(col_x.get()); + const auto & array_y = *assert_cast(col_y.get()); + + const auto & data_x = typeid_cast 
&>(array_x.getData()).getData(); + const auto & data_y = typeid_cast &>(array_y.getData()).getData(); + + const auto & offsets_x = array_x.getOffsets(); + const auto & offsets_y = array_y.getOffsets(); + + ColumnArray::Offset prev_offset = 0; + for (size_t row = 0; row < offsets_y.size(); ++row) + { + if (offsets_x[0] != offsets_y[row] - prev_offset) [[unlikely]] + { + throw Exception( + ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, + "Arguments of function {} have different array sizes: {} and {}", + getName(), + offsets_x[0], + offsets_y[row] - prev_offset); + } + prev_offset = offsets_y[row]; + } + + auto col_res = ColumnVector::create(); + auto & result = col_res->getData(); + + size_t size = offsets_y.size(); + result.resize(size); + + ColumnArray::Offset current_offset = 0; + for (size_t row = 0; row < size; ++row) + { + size_t array_size = offsets_x[0]; typename Kernel::template State state; size_t i = 0; @@ -292,13 +354,14 @@ private: /// To avoid combinatorial explosion of SIMD kernels, focus on /// - the two most common input/output types (Float32 x Float32) --> Float32 and (Float64 x Float64) --> Float64 instead of 10 x /// 10 input types x 8 output types, + /// - const/non-const inputs instead of non-const/non-const inputs /// - the most powerful SIMD instruction set (AVX-512F). #if USE_MULTITARGET_CODE if constexpr ((std::is_same_v || std::is_same_v) && std::is_same_v && std::is_same_v) { if (isArchSupported(TargetArch::AVX512F)) - Kernel::template accumulateCombine(&left[current_offset], &right[current_offset], array_size, i, state); + Kernel::template accumulateCombine(&data_x[0], &data_y[current_offset], array_size, i, state); } #else /// Process chunks in vectorized manner @@ -307,7 +370,7 @@ private: for (; i + VEC_SIZE < array_size; i += VEC_SIZE) { for (size_t j = 0; j < VEC_SIZE; ++j) - Kernel::template accumulate(states[j], static_cast(left[i + j]), static_cast(right[i + j])); + Kernel::template accumulate(states[j], static_cast(data_x[i + j]), static_cast(data_y[current_offset + i + j])); } for (const auto & other_state : states) @@ -316,13 +379,14 @@ private: /// Process the tail for (; i < array_size; ++i) - Kernel::template accumulate(state, static_cast(left[i]), static_cast(right[i])); + Kernel::template accumulate(state, static_cast(data_x[i]), static_cast(data_y[current_offset + i])); - /// ResultType res = Kernel::template finalize(state); result[row] = Kernel::template finalize(state); - current_offset = offsets[row]; + current_offset = offsets_y[row]; } + + return col_res; } }; diff --git a/tests/performance/dotProduct.xml b/tests/performance/dotProduct.xml index 6e056964ebb..a0ab7beea9c 100644 --- a/tests/performance/dotProduct.xml +++ b/tests/performance/dotProduct.xml @@ -56,6 +56,7 @@ SELECT sum(dp) FROM (SELECT dotProduct(v, v) AS dp FROM vecs_{element_type}) + WITH (SELECT v FROM vecs_{element_type} limit 1) AS a SELECT sum(dp) FROM (SELECT dotProduct(a, v) AS dp FROM vecs_{element_type}) DROP TABLE vecs_{element_type} diff --git a/tests/queries/0_stateless/02708_dotProduct.reference b/tests/queries/0_stateless/02708_dotProduct.reference index 5cc9a9f0502..593071a3521 100644 --- a/tests/queries/0_stateless/02708_dotProduct.reference +++ b/tests/queries/0_stateless/02708_dotProduct.reference @@ -32,3 +32,10 @@ 32 32 32 +-- Tests that trigger special paths +0 61 +1 186 +0 62 +1 187 +0 62 +1 187 diff --git a/tests/queries/0_stateless/02708_dotProduct.sql b/tests/queries/0_stateless/02708_dotProduct.sql index 2035e23cf1d..ac94ecc28d3 100644 --- 
a/tests/queries/0_stateless/02708_dotProduct.sql +++ b/tests/queries/0_stateless/02708_dotProduct.sql @@ -45,3 +45,12 @@ SELECT '-- Aliases'; SELECT scalarProduct([1, 2, 3], [4, 5, 6]); SELECT scalarProduct((1, 2, 3), (4, 5, 6)); SELECT arrayDotProduct([1, 2, 3], [4, 5, 6]); -- actually no alias but the internal function for arrays + +SELECT '-- Tests that trigger special paths'; +DROP TABLE IF EXISTS tab; +CREATE TABLE tab(id UInt64, vec Array(Float32)) ENGINE = MergeTree ORDER BY id; +INSERT INTO tab VALUES (0, [0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0]) (1, [5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0]); +SELECT id, arrayDotProduct(vec, vec) FROM tab ORDER BY id; -- non-const / non-const +SELECT id, arrayDotProduct([5.0, 2.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0]::Array(Float32), vec) FROM tab ORDER BY id; -- const / non-const +SELECT id, arrayDotProduct([5.0, 2.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0]::Array(Float64), vec) FROM tab ORDER BY id; -- const / non-const +DROP TABLE tab; From 87f3c957c7dc3d16d9967723e30215e12a0b5dc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Wed, 6 Mar 2024 21:16:22 +0100 Subject: [PATCH 0121/1081] Blind experiment --- base/base/itoa.h | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/base/base/itoa.h b/base/base/itoa.h index 513070c99d9..c450090d82f 100644 --- a/base/base/itoa.h +++ b/base/base/itoa.h @@ -349,16 +349,32 @@ namespace convert template -static inline char * writeUIntText(T x, char * p) +static inline char * writeUIntText(T _x, char * p) { - static_assert(is_unsigned_v); +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wbit-int-extension" + int len = digits10(_x); + static_assert(std::is_same_v || std::is_same_v); + using T_ = std::conditional_t, unsigned __int128, unsigned _BitInt(256)>; +#pragma clang diagnostic pop - int len = digits10(x); - auto * pp = p + len; - while (x >= 100) + T_ x; + T_ hundred(100ULL); + if constexpr (std::is_same_v) { - const auto i = x % 100; - x /= 100; + x = (T_(_x.items[T::_impl::little(1)]) << 64) + T_(_x.items[T::_impl::little(0)]); + } + else + { + x = (T_(_x.items[T::_impl::little(3)]) << 192) + (T_(_x.items[T::_impl::little(2)]) << 128) + + (T_(_x.items[T::_impl::little(1)]) << 64) + T_(_x.items[T::_impl::little(0)]); + } + + auto * pp = p + len; + while (x >= hundred) + { + const auto i = x % hundred; + x /= hundred; pp -= 2; outTwoDigits(pp, i); } From c192d0b12532060d14934e60164df7ce771d9399 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 6 Mar 2024 22:29:41 +0100 Subject: [PATCH 0122/1081] impl --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 83 +++++++++++++------ src/Storages/MergeTree/IMergeTreeDataPart.h | 8 +- .../MergeTree/MergeTreeDataPartCompact.cpp | 3 +- .../MergeTree/MergeTreeDataPartCompact.h | 12 +-- .../MergeTree/MergeTreeDataPartWide.cpp | 3 +- .../MergeTree/MergeTreeDataPartWide.h | 4 +- .../MergeTree/MergeTreeIndexGranularity.cpp | 5 +- .../MergeTree/MergeTreeIndexGranularity.h | 2 + .../MergeTreeIndexGranularityInfo.cpp | 14 +++- .../MergeTree/MergeTreeIndexGranularityInfo.h | 4 + 10 files changed, 97 insertions(+), 41 deletions(-) diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index a9bdceacef0..3922d5018c6 100644 --- 
a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -2,38 +2,41 @@ #include #include +#include #include -#include #include +#include +#include #include -#include +#include +#include +#include +#include #include +#include #include #include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include @@ -1966,7 +1969,39 @@ void IMergeTreeDataPart::checkConsistencyBase() const } } -void IMergeTreeDataPart::checkConsistency(bool /* require_part_metadata */) const +void IMergeTreeDataPart::checkConsistency(bool require_part_metadata) const +{ + try + { + checkConsistencyBase(); + doCheckConsistency(require_part_metadata); + } + catch (Exception & e) + { + const auto part_state = fmt::format( + "state: {}, is_unexpected_local_part: {}, is_frozen: {}, is_duplicate: {}, is_temp: {}", + stateString(), + is_unexpected_local_part, + is_frozen, + is_duplicate, + is_temp); + + const auto debug_info = fmt::format( + "columns: {}, getMarkSizeInBytes: {}, getMarksCount: {}, index_granularity_info: [{}], index_granularity: [{}], " + "part_state: [{}]", + columns.toString(), + index_granularity_info.getMarkSizeInBytes(columns.size()), + index_granularity.getMarksCount(), + index_granularity_info.describe(), + index_granularity.describe(), + part_state); + + e.addMessage(debug_info); + e.rethrow(); + } +} + +void IMergeTreeDataPart::doCheckConsistency(bool /* require_part_metadata */) const { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method 'checkConsistency' is not implemented for part with type {}", getType().toString()); } diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index 91c559d30c8..209c2d9a256 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -498,7 +498,7 @@ public: void writeChecksums(const MergeTreeDataPartChecksums & checksums_, const WriteSettings & settings); /// Checks the consistency of this data part. - virtual void checkConsistency(bool require_part_metadata) const; + void checkConsistency(bool require_part_metadata) const; /// Checks the consistency of this data part, and check the consistency of its projections (if any) as well. void checkConsistencyWithProjections(bool require_part_metadata) const; @@ -586,8 +586,6 @@ protected: void removeIfNeeded(); - void checkConsistencyBase() const; - /// Fill each_columns_size and total_size with sizes from columns files on /// disk using columns and checksums. 
virtual void calculateEachColumnSizes(ColumnSizeByName & each_columns_size, ColumnSize & total_size) const = 0; @@ -610,6 +608,8 @@ protected: void initializeIndexGranularityInfo(); + virtual void doCheckConsistency(bool require_part_metadata) const; + private: String mutable_name; mutable MergeTreeDataPartState state{MergeTreeDataPartState::Temporary}; @@ -697,6 +697,8 @@ private: void incrementStateMetric(MergeTreeDataPartState state) const; void decrementStateMetric(MergeTreeDataPartState state) const; + void checkConsistencyBase() const; + /// This ugly flag is needed for debug assertions only mutable bool part_is_probably_removed_from_disk = false; }; diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp index 0ecd7abe183..5d4b602b5b8 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.cpp @@ -157,9 +157,8 @@ std::optional MergeTreeDataPartCompact::getColumnModificationTime(const return getDataPartStorage().getFileLastModified(DATA_FILE_NAME_WITH_EXTENSION).epochTime(); } -void MergeTreeDataPartCompact::checkConsistency(bool require_part_metadata) const +void MergeTreeDataPartCompact::doCheckConsistency(bool require_part_metadata) const { - checkConsistencyBase(); String mrk_file_name = DATA_FILE_NAME + getMarksFileExtension(); if (!checksums.empty()) diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index 35a358b3720..f897bcb0bfd 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -67,14 +67,14 @@ protected: MergeTreeIndexGranularity & index_granularity_, const MergeTreeIndexGranularityInfo & index_granularity_info_, size_t columns_count, const IDataPartStorage & data_part_storage_); -private: - void checkConsistency(bool require_part_metadata) const override; + void doCheckConsistency(bool require_part_metadata) const override; - /// Loads marks index granularity into memory - void loadIndexGranularity() override; + private: + /// Loads marks index granularity into memory + void loadIndexGranularity() override; - /// Compact parts doesn't support per column size, only total size - void calculateEachColumnSizes(ColumnSizeByName & each_columns_size, ColumnSize & total_size) const override; + /// Compact parts doesn't support per column size, only total size + void calculateEachColumnSizes(ColumnSizeByName & each_columns_size, ColumnSize & total_size) const override; }; } diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp index dc6c1f0019d..0111f1e7b40 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.cpp @@ -182,9 +182,8 @@ MergeTreeDataPartWide::~MergeTreeDataPartWide() removeIfNeeded(); } -void MergeTreeDataPartWide::checkConsistency(bool require_part_metadata) const +void MergeTreeDataPartWide::doCheckConsistency(bool require_part_metadata) const { - checkConsistencyBase(); std::string marks_file_extension = index_granularity_info.mark_type.getFileExtension(); if (!checksums.empty()) diff --git a/src/Storages/MergeTree/MergeTreeDataPartWide.h b/src/Storages/MergeTree/MergeTreeDataPartWide.h index 14147c4ad56..508ea16d2d4 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartWide.h +++ b/src/Storages/MergeTree/MergeTreeDataPartWide.h @@ -62,9 +62,9 @@ protected: MergeTreeIndexGranularity & 
index_granularity_, MergeTreeIndexGranularityInfo & index_granularity_info_, const IDataPartStorage & data_part_storage_, const std::string & any_column_file_name); -private: - void checkConsistency(bool require_part_metadata) const override; + void doCheckConsistency(bool require_part_metadata) const override; +private: /// Loads marks index granularity into memory void loadIndexGranularity() override; diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp b/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp index 5fdd0555777..2a45ab1d927 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexGranularity.cpp @@ -121,5 +121,8 @@ void MergeTreeIndexGranularity::resizeWithFixedGranularity(size_t size, size_t f } } - +std::string MergeTreeIndexGranularity::describe() const +{ + return fmt::format("initialized: {}, marks_rows_partial_sums: [{}]", initialized, fmt::join(marks_rows_partial_sums, ", ")); +} } diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularity.h b/src/Storages/MergeTree/MergeTreeIndexGranularity.h index f5677995ae0..d67762f7293 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularity.h +++ b/src/Storages/MergeTree/MergeTreeIndexGranularity.h @@ -95,6 +95,8 @@ public: /// Add `size` of marks with `fixed_granularity` rows void resizeWithFixedGranularity(size_t size, size_t fixed_granularity); + + std::string describe() const; }; } diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp index da89d52a9ff..1ff72a4e36d 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp +++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.cpp @@ -88,6 +88,10 @@ std::string MarkType::getFileExtension() const } } +std::string MarkType::describe() const +{ + return fmt::format("adaptive: {}, compressed: {}, part_type: {}", adaptive, compressed, part_type); +} std::optional MergeTreeIndexGranularityInfo::getMarksTypeFromFilesystem(const IDataPartStorage & data_part_storage) { @@ -132,10 +136,18 @@ size_t MergeTreeIndexGranularityInfo::getMarkSizeInBytes(size_t columns_num) con throw Exception(ErrorCodes::UNKNOWN_PART_TYPE, "Unknown part type"); } +std::string MergeTreeIndexGranularityInfo::describe() const +{ + return fmt::format( + "mark_type: [{}], index_granularity_bytes: {}, fixed_index_granularity: {}", + mark_type.describe(), + index_granularity_bytes, + fixed_index_granularity); +} + size_t getAdaptiveMrkSizeCompact(size_t columns_num) { /// Each mark contains number of rows in granule and two offsets for every column. 
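/// All of these are UInt64 values, so e.g. a part with 3 columns needs (3 * 2 + 1) * 8 = 56 bytes per mark.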
return sizeof(UInt64) * (columns_num * 2 + 1); } - } diff --git a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h index af008866919..85006c3ffde 100644 --- a/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h +++ b/src/Storages/MergeTree/MergeTreeIndexGranularityInfo.h @@ -24,6 +24,8 @@ struct MarkType static bool isMarkFileExtension(std::string_view extension); std::string getFileExtension() const; + std::string describe() const; + bool adaptive = false; bool compressed = false; MergeTreeDataPartType::Value part_type = MergeTreeDataPartType::Unknown; @@ -58,6 +60,8 @@ public: size_t getMarkSizeInBytes(size_t columns_num = 1) const; static std::optional getMarksTypeFromFilesystem(const IDataPartStorage & data_part_storage); + + std::string describe() const; }; constexpr inline auto getNonAdaptiveMrkSizeWide() { return sizeof(UInt64) * 2; } From 2b52583e06056e19df97216f41b81102bca8bd9d Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Wed, 6 Mar 2024 23:01:42 +0100 Subject: [PATCH 0123/1081] fix style --- src/Storages/MergeTree/MergeTreeDataPartCompact.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeDataPartCompact.h b/src/Storages/MergeTree/MergeTreeDataPartCompact.h index f897bcb0bfd..8bbec2808d7 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartCompact.h +++ b/src/Storages/MergeTree/MergeTreeDataPartCompact.h @@ -69,7 +69,7 @@ protected: void doCheckConsistency(bool require_part_metadata) const override; - private: +private: /// Loads marks index granularity into memory void loadIndexGranularity() override; From fb17749b50ce1024ef8c0b6f7bb8b7a58321894c Mon Sep 17 00:00:00 2001 From: Nikolay Monkov Date: Thu, 7 Mar 2024 09:45:24 +0500 Subject: [PATCH 0124/1081] file has been reformatted to pass Style check --- tests/ci/docker_server.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/ci/docker_server.py b/tests/ci/docker_server.py index 803dbfcd92a..35c86d8eadd 100644 --- a/tests/ci/docker_server.py +++ b/tests/ci/docker_server.py @@ -216,7 +216,9 @@ def gen_tags(version: ClickHouseVersion, release_type: str) -> List[str]: return tags -def buildx_args(urls: Dict[str, str], arch: str, direct_urls: List[str], version: str) -> List[str]: +def buildx_args( + urls: Dict[str, str], arch: str, direct_urls: List[str], version: str +) -> List[str]: args = [ f"--platform=linux/{arch}", f"--label=build-url={GITHUB_RUN_URL}", @@ -268,7 +270,9 @@ def build_and_push_image( urls = [url for url in direct_urls[arch] if ".deb" in url] else: urls = [url for url in direct_urls[arch] if ".tgz" in url] - cmd_args.extend(buildx_args(repo_urls, arch, direct_urls=urls, version=version.describe)) + cmd_args.extend( + buildx_args(repo_urls, arch, direct_urls=urls, version=version.describe) + ) if not push: cmd_args.append(f"--tag={image.repo}:{arch_tag}") cmd_args.extend( From 50b84954e4810c94c1397504a64ca96e1a0fed55 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 7 Mar 2024 16:29:38 +0800 Subject: [PATCH 0125/1081] Update .reference --- .../0_stateless/02117_show_create_table_system.reference | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/queries/0_stateless/02117_show_create_table_system.reference b/tests/queries/0_stateless/02117_show_create_table_system.reference index 7382b24afbc..5081527ceef 100644 --- a/tests/queries/0_stateless/02117_show_create_table_system.reference +++ b/tests/queries/0_stateless/02117_show_create_table_system.reference 
@@ -195,6 +195,8 @@ CREATE TABLE system.disks `unreserved_space` UInt64, `keep_free_space` UInt64, `type` String, + `object_storage_type` String, + `metadata_type` String, `is_encrypted` UInt8, `is_read_only` UInt8, `is_write_once` UInt8, From 31ed1966e3c5388e601edd6e97c0497153bb7196 Mon Sep 17 00:00:00 2001 From: kssenii Date: Thu, 7 Mar 2024 16:44:10 +0800 Subject: [PATCH 0126/1081] Fix build --- src/Disks/ObjectStorages/ObjectStorageFactory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 9d7e714445a..46136ad7b12 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -166,7 +166,7 @@ void registerS3ObjectStorage(ObjectStorageFactory & factory) /// NOTE: should we still perform this check for clickhouse-disks? if (!skip_access_check) - checkS3Capabilities(*object_storage, s3_capabilities, name); + checkS3Capabilities(*dynamic_cast(object_storage.get()), s3_capabilities, name); return object_storage; }); @@ -202,7 +202,7 @@ void registerS3PlainObjectStorage(ObjectStorageFactory & factory) /// NOTE: should we still perform this check for clickhouse-disks? if (!skip_access_check) - checkS3Capabilities(*object_storage, s3_capabilities, name); + checkS3Capabilities(*dynamic_cast(object_storage.get()), s3_capabilities, name); return object_storage; }); From 10b5ce8ab3d1b412f6500d03bc96e205965178d7 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 7 Mar 2024 10:26:50 +0100 Subject: [PATCH 0127/1081] Updated BufferAllocationPolicy --- src/Backups/BackupIO_AzureBlobStorage.cpp | 4 +-- .../BufferAllocationPolicy.cpp} | 25 +++++-------- src/Common/BufferAllocationPolicy.h | 35 +++++++++++++++++++ src/Common/ThreadPoolTaskTracker.h | 2 -- src/Core/Settings.h | 5 +++ .../IO/WriteBufferFromAzureBlobStorage.cpp | 23 +++++++----- .../IO/WriteBufferFromAzureBlobStorage.h | 9 ++--- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 29 ++++++++------- .../AzureBlobStorage/AzureObjectStorage.cpp | 4 +-- .../AzureBlobStorage/AzureObjectStorage.h | 17 +++++++-- src/IO/WriteBufferFromS3.cpp | 7 +++- src/IO/WriteBufferFromS3.h | 13 +------ 12 files changed, 107 insertions(+), 66 deletions(-) rename src/{IO/WriteBufferFromS3BufferAllocationPolicy.cpp => Common/BufferAllocationPolicy.cpp} (74%) create mode 100644 src/Common/BufferAllocationPolicy.h diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index fb36248433d..8d2b217ad21 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -275,11 +275,9 @@ std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const Strin return std::make_unique( client, key, - settings->max_single_part_upload_size, - settings->max_unexpected_write_error_retries, DBMS_DEFAULT_BUFFER_SIZE, write_settings, - settings->max_inflight_parts_for_one_file); + settings); } void BackupWriterAzureBlobStorage::removeFile(const String & file_name) diff --git a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp b/src/Common/BufferAllocationPolicy.cpp similarity index 74% rename from src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp rename to src/Common/BufferAllocationPolicy.cpp index 6347c1acfd7..1456233eb03 100644 --- a/src/IO/WriteBufferFromS3BufferAllocationPolicy.cpp +++ b/src/Common/BufferAllocationPolicy.cpp @@ -1,21 +1,17 @@ -#include "config.h" - -#if USE_AWS_S3 - -#include +#include 
"BufferAllocationPolicy.h" #include -namespace +namespace DB { -class FixedSizeBufferAllocationPolicy : public DB::WriteBufferFromS3::IBufferAllocationPolicy +class FixedSizeBufferAllocationPolicy : public IBufferAllocationPolicy { const size_t buffer_size = 0; size_t buffer_number = 0; public: - explicit FixedSizeBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) + explicit FixedSizeBufferAllocationPolicy(const BufferAllocationSettings & settings_) : buffer_size(settings_.strict_upload_part_size) { chassert(buffer_size > 0); @@ -36,7 +32,7 @@ public: }; -class ExpBufferAllocationPolicy : public DB::WriteBufferFromS3::IBufferAllocationPolicy +class ExpBufferAllocationPolicy : public DB::IBufferAllocationPolicy { const size_t first_size = 0; const size_t second_size = 0; @@ -49,7 +45,7 @@ class ExpBufferAllocationPolicy : public DB::WriteBufferFromS3::IBufferAllocatio size_t buffer_number = 0; public: - explicit ExpBufferAllocationPolicy(const DB::S3Settings::RequestSettings::PartUploadSettings & settings_) + explicit ExpBufferAllocationPolicy(const BufferAllocationSettings & settings_) : first_size(std::max(settings_.max_single_part_upload_size, settings_.min_upload_part_size)) , second_size(settings_.min_upload_part_size) , multiply_factor(settings_.upload_part_size_multiply_factor) @@ -92,14 +88,10 @@ public: } }; -} -namespace DB -{ +IBufferAllocationPolicy::~IBufferAllocationPolicy() = default; -WriteBufferFromS3::IBufferAllocationPolicy::~IBufferAllocationPolicy() = default; - -WriteBufferFromS3::IBufferAllocationPolicyPtr WriteBufferFromS3::ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_) +IBufferAllocationPolicyPtr ChooseBufferPolicy(BufferAllocationSettings settings_) { if (settings_.strict_upload_part_size > 0) return std::make_unique(settings_); @@ -109,4 +101,3 @@ WriteBufferFromS3::IBufferAllocationPolicyPtr WriteBufferFromS3::ChooseBufferPol } -#endif diff --git a/src/Common/BufferAllocationPolicy.h b/src/Common/BufferAllocationPolicy.h new file mode 100644 index 00000000000..b759d22ede6 --- /dev/null +++ b/src/Common/BufferAllocationPolicy.h @@ -0,0 +1,35 @@ +#pragma once + +#include "config.h" + +#include "logger_useful.h" + +#include + +namespace DB +{ + +struct BufferAllocationSettings +{ + size_t strict_upload_part_size = 0; + size_t min_upload_part_size = 16 * 1024 * 1024; + size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; + size_t upload_part_size_multiply_factor = 2; + size_t upload_part_size_multiply_parts_count_threshold = 500; + size_t max_single_part_upload_size = 32 * 1024 * 1024; +}; + +class IBufferAllocationPolicy +{ + public: + virtual size_t getBufferNumber() const = 0; + virtual size_t getBufferSize() const = 0; + virtual void nextBuffer() = 0; + virtual ~IBufferAllocationPolicy() = 0; +}; + +using IBufferAllocationPolicyPtr = std::unique_ptr; + +IBufferAllocationPolicyPtr ChooseBufferPolicy(BufferAllocationSettings settings_); + +} diff --git a/src/Common/ThreadPoolTaskTracker.h b/src/Common/ThreadPoolTaskTracker.h index d37b759a913..72591648d30 100644 --- a/src/Common/ThreadPoolTaskTracker.h +++ b/src/Common/ThreadPoolTaskTracker.h @@ -11,8 +11,6 @@ namespace DB { -/// That class is used only in WriteBufferFromS3 for now. -/// Therefore it declared as a part of WriteBufferFromS3. /// TaskTracker takes a Callback which is run by scheduler in some external shared ThreadPool. 
/// TaskTracker brings the methods waitIfAny, waitAll/safeWaitAll /// to help with coordination of the running tasks. diff --git a/src/Core/Settings.h b/src/Core/Settings.h index b186ca6fe01..8ad08b7e348 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -78,10 +78,15 @@ class IColumn; M(UInt64, distributed_connections_pool_size, 1024, "Maximum number of connections with one remote server in the pool.", 0) \ M(UInt64, connections_with_failover_max_tries, 3, "The maximum number of attempts to connect to replicas.", 0) \ M(UInt64, s3_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to S3 (some implementations does not supports variable size parts).", 0) \ + M(UInt64, azure_strict_upload_part_size, 0, "The exact size of part to upload during multipart upload to Azure blob storage.", 0) \ M(UInt64, s3_min_upload_part_size, 16*1024*1024, "The minimum size of part to upload during multipart upload to S3.", 0) \ M(UInt64, s3_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to S3.", 0) \ + M(UInt64, azure_min_upload_part_size, 16*1024*1024, "The minimum size of part to upload during multipart upload to Azure blob storage.", 0) \ + M(UInt64, azure_max_upload_part_size, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage.", 0) \ M(UInt64, s3_upload_part_size_multiply_factor, 2, "Multiply s3_min_upload_part_size by this factor each time s3_multiply_parts_count_threshold parts were uploaded from a single write to S3.", 0) \ M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor.", 0) \ + M(UInt64, azure_upload_part_size_multiply_factor, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage.", 0) \ + M(UInt64, azure_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor.", 0) \ M(UInt64, s3_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited.", 0) \ M(UInt64, azure_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 
0 means unlimited.", 0) \ M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp index 74a8949b235..bc11d445a51 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.cpp @@ -27,23 +27,27 @@ struct WriteBufferFromAzureBlobStorage::PartData WriteBufferFromAzureBlobStorage::WriteBufferFromAzureBlobStorage( std::shared_ptr blob_container_client_, const String & blob_path_, - size_t max_single_part_upload_size_, - size_t max_unexpected_write_error_retries_, size_t buf_size_, const WriteSettings & write_settings_, - size_t max_inflight_parts_for_one_file_, + std::shared_ptr settings_, ThreadPoolCallbackRunner schedule_) : WriteBufferFromFileBase(buf_size_, nullptr, 0) , log(getLogger("WriteBufferFromAzureBlobStorage")) - , max_single_part_upload_size(max_single_part_upload_size_) - , max_unexpected_write_error_retries(max_unexpected_write_error_retries_) + , buffer_allocation_policy(ChooseBufferPolicy({settings_->strict_upload_part_size, + settings_->min_upload_part_size, + settings_->max_upload_part_size, + settings_->upload_part_size_multiply_factor, + settings_->upload_part_size_multiply_parts_count_threshold, + settings_->max_single_part_upload_size})) + , max_single_part_upload_size(settings_->max_single_part_upload_size) + , max_unexpected_write_error_retries(settings_->max_unexpected_write_error_retries) , blob_path(blob_path_) , write_settings(write_settings_) , blob_container_client(blob_container_client_) , task_tracker( std::make_unique( std::move(schedule_), - max_inflight_parts_for_one_file_, + settings_->max_inflight_parts_for_one_file, limitedLog)) { allocateBuffer(); @@ -119,7 +123,8 @@ void WriteBufferFromAzureBlobStorage::nextImpl() void WriteBufferFromAzureBlobStorage::allocateBuffer() { - memory = Memory(max_single_part_upload_size); + buffer_allocation_policy->nextBuffer(); + memory = Memory(buffer_allocation_policy->getBufferSize()); WriteBuffer::set(memory.data(), memory.size()); } @@ -129,10 +134,10 @@ void WriteBufferFromAzureBlobStorage::reallocateBuffer() if (available() > 0) return; - if (memory.size() == max_single_part_upload_size) + if (memory.size() == buffer_allocation_policy->getBufferSize()) return; - memory.resize(max_single_part_upload_size); + memory.resize(buffer_allocation_policy->getBufferSize()); WriteBuffer::set(memory.data(), memory.size()); diff --git a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h index 4897ca9a846..7223f66693e 100644 --- a/src/Disks/IO/WriteBufferFromAzureBlobStorage.h +++ b/src/Disks/IO/WriteBufferFromAzureBlobStorage.h @@ -12,7 +12,8 @@ #include #include #include - +#include +#include namespace Poco { @@ -32,11 +33,9 @@ public: WriteBufferFromAzureBlobStorage( AzureClientPtr blob_container_client_, const String & blob_path_, - size_t max_single_part_upload_size_, - size_t max_unexpected_write_error_retries_, size_t buf_size_, const WriteSettings & write_settings_, - size_t max_inflight_parts_for_one_file_, + std::shared_ptr settings_, ThreadPoolCallbackRunner schedule_ = {}); ~WriteBufferFromAzureBlobStorage() override; @@ -63,6 +62,8 @@ private: LoggerPtr log; LogSeriesLimiterPtr limitedLog = std::make_shared(log, 1, 5); + IBufferAllocationPolicyPtr buffer_allocation_policy; + const size_t max_single_part_upload_size; const size_t 
max_unexpected_write_error_retries; const std::string blob_path; diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index f99586b2d1a..1d01e2f45e3 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -160,18 +160,23 @@ std::unique_ptr getAzureBlobContainerClient( std::unique_ptr getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { - return std::make_unique( - config.getUInt64(config_prefix + ".max_single_part_upload_size", 100 * 1024 * 1024), - config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), - config.getInt(config_prefix + ".max_single_read_retries", 3), - config.getInt(config_prefix + ".max_single_download_retries", 3), - config.getInt(config_prefix + ".list_object_keys_size", 1000), - config.getUInt64(config_prefix + ".max_upload_part_size", 5ULL * 1024 * 1024 * 1024), - config.getUInt64(config_prefix + ".max_single_part_copy_size", context->getSettings().azure_max_single_part_copy_size), - config.getBool(config_prefix + ".use_native_copy", false), - config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", context->getSettings().azure_max_unexpected_write_error_retries), - config.getUInt64(config_prefix + ".max_inflight_parts_for_one_file", context->getSettings().azure_max_inflight_parts_for_one_file) - ); + std::unique_ptr settings = std::make_unique(); + settings->max_single_part_upload_size = config.getUInt64(config_prefix + ".max_single_part_upload_size", 100 * 1024 * 1024); + settings->min_bytes_for_seek = config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024); + settings->max_single_read_retries = config.getInt(config_prefix + ".max_single_read_retries", 3); + settings->max_single_download_retries = config.getInt(config_prefix + ".max_single_download_retries", 3); + settings->list_object_keys_size = config.getInt(config_prefix + ".list_object_keys_size", 1000); + settings->min_upload_part_size = config.getUInt64(config_prefix + ".min_upload_part_size", context->getSettings().azure_min_upload_part_size); + settings->max_upload_part_size = config.getUInt64(config_prefix + ".max_upload_part_size", context->getSettings().azure_max_upload_part_size); + settings->max_single_part_copy_size = config.getUInt64(config_prefix + ".max_single_part_copy_size", context->getSettings().azure_max_single_part_copy_size); + settings->use_native_copy = config.getBool(config_prefix + ".use_native_copy", false); + settings->max_unexpected_write_error_retries = config.getUInt64(config_prefix + ".max_unexpected_write_error_retries", context->getSettings().azure_max_unexpected_write_error_retries); + settings->max_inflight_parts_for_one_file = config.getUInt64(config_prefix + ".max_inflight_parts_for_one_file", context->getSettings().azure_max_inflight_parts_for_one_file); + settings->strict_upload_part_size = config.getUInt64(config_prefix + ".strict_upload_part_size", context->getSettings().azure_strict_upload_part_size); + settings->upload_part_size_multiply_factor = config.getUInt64(config_prefix + ".upload_part_size_multiply_factor", context->getSettings().azure_upload_part_size_multiply_factor); + settings->upload_part_size_multiply_parts_count_threshold = config.getUInt64(config_prefix + ".upload_part_size_multiply_parts_count_threshold", 
context->getSettings().azure_upload_part_size_multiply_parts_count_threshold); + + return settings; } } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 844789ea5b5..15ab55d5611 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -265,11 +265,9 @@ std::unique_ptr AzureObjectStorage::writeObject( /// NO return std::make_unique( client.get(), object.remote_path, - settings.get()->max_single_part_upload_size, - settings.get()->max_unexpected_write_error_retries, buf_size, patchSettings(write_settings), - settings.get()->max_inflight_parts_for_one_file); + settings.get()); } /// Remove file. Throws exception if file doesn't exists or it's a directory. diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 1b473a01304..b97d706a4d9 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -24,21 +24,29 @@ struct AzureObjectStorageSettings int max_single_read_retries_, int max_single_download_retries_, int list_object_keys_size_, + size_t min_upload_part_size_, size_t max_upload_part_size_, size_t max_single_part_copy_size_, bool use_native_copy_, size_t max_unexpected_write_error_retries_, - size_t max_inflight_parts_for_one_file_) + size_t max_inflight_parts_for_one_file_, + size_t strict_upload_part_size_, + size_t upload_part_size_multiply_factor_, + size_t upload_part_size_multiply_parts_count_threshold_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) , max_single_download_retries(max_single_download_retries_) , list_object_keys_size(list_object_keys_size_) + , min_upload_part_size(min_upload_part_size_) , max_upload_part_size(max_upload_part_size_) , max_single_part_copy_size(max_single_part_copy_size_) , use_native_copy(use_native_copy_) - , max_unexpected_write_error_retries (max_unexpected_write_error_retries_) - , max_inflight_parts_for_one_file (max_inflight_parts_for_one_file_) + , max_unexpected_write_error_retries(max_unexpected_write_error_retries_) + , max_inflight_parts_for_one_file(max_inflight_parts_for_one_file_) + , strict_upload_part_size(strict_upload_part_size_) + , upload_part_size_multiply_factor(upload_part_size_multiply_factor_) + , upload_part_size_multiply_parts_count_threshold(upload_part_size_multiply_parts_count_threshold_) { } @@ -55,6 +63,9 @@ struct AzureObjectStorageSettings bool use_native_copy = false; size_t max_unexpected_write_error_retries = 4; size_t max_inflight_parts_for_one_file = 20; + size_t strict_upload_part_size = 0; + size_t upload_part_size_multiply_factor = 2; + size_t upload_part_size_multiply_parts_count_threshold = 500; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; diff --git a/src/IO/WriteBufferFromS3.cpp b/src/IO/WriteBufferFromS3.cpp index 510d9bef4d3..60fa828d6c4 100644 --- a/src/IO/WriteBufferFromS3.cpp +++ b/src/IO/WriteBufferFromS3.cpp @@ -93,7 +93,12 @@ WriteBufferFromS3::WriteBufferFromS3( , write_settings(write_settings_) , client_ptr(std::move(client_ptr_)) , object_metadata(std::move(object_metadata_)) - , buffer_allocation_policy(ChooseBufferPolicy(upload_settings)) + , 
buffer_allocation_policy(ChooseBufferPolicy({upload_settings.strict_upload_part_size, + upload_settings.min_upload_part_size, + upload_settings.max_upload_part_size, + upload_settings.upload_part_size_multiply_factor, + upload_settings.upload_part_size_multiply_parts_count_threshold, + upload_settings.max_single_part_upload_size})) , task_tracker( std::make_unique( std::move(schedule_), diff --git a/src/IO/WriteBufferFromS3.h b/src/IO/WriteBufferFromS3.h index afd8b9909c1..840274c8ace 100644 --- a/src/IO/WriteBufferFromS3.h +++ b/src/IO/WriteBufferFromS3.h @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -49,18 +50,6 @@ public: std::string getFileName() const override { return key; } void sync() override { next(); } - class IBufferAllocationPolicy - { - public: - virtual size_t getBufferNumber() const = 0; - virtual size_t getBufferSize() const = 0; - virtual void nextBuffer() = 0; - virtual ~IBufferAllocationPolicy() = 0; - }; - using IBufferAllocationPolicyPtr = std::unique_ptr; - - static IBufferAllocationPolicyPtr ChooseBufferPolicy(const S3Settings::RequestSettings::PartUploadSettings & settings_); - private: /// Receives response from the server after sending all data. void finalizeImpl() override; From f2a3ffe9eb79046093e77ed39f2366754e7a8ba2 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Thu, 7 Mar 2024 17:14:12 +0800 Subject: [PATCH 0128/1081] Replace some headers with forward decl --- src/Backups/BackupCoordinationRemote.cpp | 1 + .../NamedCollections/NamedCollectionUtils.cpp | 1 + .../ObjectStorages/ObjectStorageFactory.cpp | 3 +++ src/Formats/ReadSchemaUtils.cpp | 1 + src/Interpreters/DatabaseCatalog.cpp | 1 - src/Interpreters/DatabaseCatalog.h | 10 +++---- src/Processors/QueryPlan/AggregatingStep.cpp | 1 + src/Processors/QueryPlan/CubeStep.cpp | 1 + src/Storages/StorageAzureBlob.cpp | 4 +++ src/Storages/StorageS3.h | 27 ++++++++++--------- 10 files changed, 29 insertions(+), 21 deletions(-) diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index 9c509858b2a..b869f890f56 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -1,6 +1,7 @@ #include #include +#include #include #include diff --git a/src/Common/NamedCollections/NamedCollectionUtils.cpp b/src/Common/NamedCollections/NamedCollectionUtils.cpp index fe0f42467c7..e3ff50f5e3f 100644 --- a/src/Common/NamedCollections/NamedCollectionUtils.cpp +++ b/src/Common/NamedCollections/NamedCollectionUtils.cpp @@ -17,6 +17,7 @@ #include #include +#include namespace fs = std::filesystem; diff --git a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp index 4f198be64fe..5fae257e8d4 100644 --- a/src/Disks/ObjectStorages/ObjectStorageFactory.cpp +++ b/src/Disks/ObjectStorages/ObjectStorageFactory.cpp @@ -21,6 +21,9 @@ #include #include +#include + +namespace fs = std::filesystem; namespace DB { diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 5badf4301bf..736a35927c3 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index a9fd5c852ba..a5a523b658b 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -26,7 +26,6 @@ #include #include -#include "Interpreters/Context_fwd.h" #include "config.h" #if USE_MYSQL 
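The pattern running through this commit: widely-included headers keep only forward declarations and cheap includes, while heavy headers (the full IStorage definition, <filesystem>) move into the .cpp files that actually need them — note how `namespace fs = std::filesystem;` leaves DatabaseCatalog.h in the hunks that follow and reappears locally in StorageAzureBlob.cpp and StorageS3.h. A sketch of the *_fwd.h idiom relied on here (simplified; the exact contents of the repository's forward-declaration headers are an assumption):

    // IStorage_fwd.h -- cheap to include from any header
    #pragma once
    #include <memory>

    class IStorage;  // forward declaration instead of the full class definition
    using StoragePtr = std::shared_ptr<IStorage>;

    // Only .cpp files that call IStorage methods include the full Storages/IStorage.h.
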
diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index 4fe114cc493..6995fc51941 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -1,15 +1,14 @@ #pragma once #include +#include +#include #include #include -#include #include #include -#include "Common/NamePrompter.h" +#include #include -#include "Storages/IStorage.h" -#include "Databases/IDatabase.h" #include #include @@ -23,9 +22,6 @@ #include #include #include -#include - -namespace fs = std::filesystem; namespace DB { diff --git a/src/Processors/QueryPlan/AggregatingStep.cpp b/src/Processors/QueryPlan/AggregatingStep.cpp index f374a7b7b10..a76bacdd97b 100644 --- a/src/Processors/QueryPlan/AggregatingStep.cpp +++ b/src/Processors/QueryPlan/AggregatingStep.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include diff --git a/src/Processors/QueryPlan/CubeStep.cpp b/src/Processors/QueryPlan/CubeStep.cpp index 0c632c346c7..bf2ce148529 100644 --- a/src/Processors/QueryPlan/CubeStep.cpp +++ b/src/Processors/QueryPlan/CubeStep.cpp @@ -5,6 +5,7 @@ #include #include #include +#include namespace DB { diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 1f0fba99f84..2d4f1db04a1 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -41,6 +41,10 @@ #include #include +#include + +namespace fs = std::filesystem; + using namespace Azure::Storage::Blobs; namespace CurrentMetrics diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index 65fb3b51be2..bf81ead0599 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -4,27 +4,28 @@ #if USE_AWS_S3 -#include - #include - -#include -#include - -#include -#include -#include -#include -#include +#include #include +#include +#include #include #include -#include +#include +#include +#include #include +#include #include #include +#include #include -#include +#include +#include + +#include + +namespace fs = std::filesystem; namespace Aws::S3 { From f0a8d8843de5dffae2e1d4476fb119ad34059340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 7 Mar 2024 00:10:06 +0100 Subject: [PATCH 0129/1081] Not x86_64 are lagging behind in features --- base/base/itoa.h | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/base/base/itoa.h b/base/base/itoa.h index c450090d82f..a36eecaf1e5 100644 --- a/base/base/itoa.h +++ b/base/base/itoa.h @@ -351,12 +351,20 @@ namespace convert template static inline char * writeUIntText(T _x, char * p) { -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wbit-int-extension" int len = digits10(_x); static_assert(std::is_same_v || std::is_same_v); - using T_ = std::conditional_t, unsigned __int128, unsigned _BitInt(256)>; + using T_ = std::conditional_t< + std::is_same_v, + unsigned __int128, +#if defined(__x86_64__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wbit-int-extension" + unsigned _BitInt(256) #pragma clang diagnostic pop +#else + T +#endif + >; T_ x; T_ hundred(100ULL); @@ -366,8 +374,12 @@ static inline char * writeUIntText(T _x, char * p) } else { +#if defined(__x86_64__) x = (T_(_x.items[T::_impl::little(3)]) << 192) + (T_(_x.items[T::_impl::little(2)]) << 128) + (T_(_x.items[T::_impl::little(1)]) << 64) + T_(_x.items[T::_impl::little(0)]); +#else + x = _x; +#endif } auto * pp = p + len; From 66dea5111298abd4301df55b5615d158105fe78f Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi 
<114298166+yariks5s@users.noreply.github.com> Date: Thu, 7 Mar 2024 12:40:48 +0100 Subject: [PATCH 0130/1081] fix clang-tidy --- src/Functions/array/arrayDotProduct.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/array/arrayDotProduct.cpp b/src/Functions/array/arrayDotProduct.cpp index f9a6687e028..8b7c85e05dd 100644 --- a/src/Functions/array/arrayDotProduct.cpp +++ b/src/Functions/array/arrayDotProduct.cpp @@ -322,18 +322,18 @@ private: const auto & offsets_y = array_y.getOffsets(); ColumnArray::Offset prev_offset = 0; - for (size_t row = 0; row < offsets_y.size(); ++row) + for (auto offset_y : offsets_y) { - if (offsets_x[0] != offsets_y[row] - prev_offset) [[unlikely]] + if (offsets_x[0] != offset_y - prev_offset) [[unlikely]] { throw Exception( ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, "Arguments of function {} have different array sizes: {} and {}", getName(), offsets_x[0], - offsets_y[row] - prev_offset); + offset_y - prev_offset); } - prev_offset = offsets_y[row]; + prev_offset = offset_y; } auto col_res = ColumnVector::create(); From 6c69e7d4dcfdfa21cfcaa103fc1cc7c53dfe0291 Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Thu, 7 Mar 2024 20:29:04 +0800 Subject: [PATCH 0131/1081] detect output format by file extension in clickhouse-local --- programs/local/LocalServer.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 68f0e52ce08..20974dd9751 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -327,6 +327,14 @@ static bool checkIfStdinIsRegularFile() return fstat(STDIN_FILENO, &file_stat) == 0 && S_ISREG(file_stat.st_mode); } + +static bool checkIfStdoutIsRegularFile() +{ + struct stat file_stat; + return fstat(STDOUT_FILENO, &file_stat) == 0 && S_ISREG(file_stat.st_mode); +} + + std::string LocalServer::getInitialCreateTableQuery() { if (!config().has("table-structure") && !config().has("table-file") && !config().has("table-data-format") && (!checkIfStdinIsRegularFile() || queries.empty())) @@ -638,7 +646,14 @@ void LocalServer::processConfig() if (config().has("macros")) global_context->setMacros(std::make_unique(config(), "macros", log)); - format = config().getString("output-format", config().getString("format", is_interactive ? "PrettyCompact" : "TSV")); + if (!config().has("output-format") && !config().has("format") && checkIfStdoutIsRegularFile()) + { + std::optional format_from_file_name; + format_from_file_name = FormatFactory::instance().tryGetFormatFromFileDescriptor(STDOUT_FILENO); + format = format_from_file_name ? *format_from_file_name : "TSV"; + } + else + format = config().getString("output-format", config().getString("format", is_interactive ? 
"PrettyCompact" : "TSV")); insert_format = "Values"; /// Setting value from cmd arg overrides one from config From 6d5fd2857ed50047d8acf48766165aa815ca30b9 Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Thu, 7 Mar 2024 20:29:42 +0800 Subject: [PATCH 0132/1081] detect output format by file extension in clickhouse-client --- programs/client/Client.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index a2bd6b6016a..fac34003553 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp @@ -50,6 +50,7 @@ #include #include #include +#include namespace fs = std::filesystem; using namespace std::literals; @@ -1137,6 +1138,13 @@ void Client::processOptions(const OptionsDescription & options_description, } +static bool checkIfStdoutIsRegularFile() +{ + struct stat file_stat; + return fstat(STDOUT_FILENO, &file_stat) == 0 && S_ISREG(file_stat.st_mode); +} + + void Client::processConfig() { if (!queries.empty() && config().has("queries-file")) @@ -1173,7 +1181,14 @@ void Client::processConfig() pager = config().getString("pager", ""); is_default_format = !config().has("vertical") && !config().has("format"); - if (config().has("vertical")) + if (is_default_format && checkIfStdoutIsRegularFile()) + { + is_default_format = false; + std::optional format_from_file_name; + format_from_file_name = FormatFactory::instance().tryGetFormatFromFileDescriptor(STDOUT_FILENO); + format = format_from_file_name ? *format_from_file_name : "TabSeparated"; + } + else if (config().has("vertical")) format = config().getString("format", "Vertical"); else format = config().getString("format", is_interactive ? "PrettyCompact" : "TabSeparated"); From 112c1efb7da2619cb67a48ff7fbe65ecea8e44a9 Mon Sep 17 00:00:00 2001 From: HowePa <2873679104@qq.com> Date: Thu, 7 Mar 2024 20:30:24 +0800 Subject: [PATCH 0133/1081] test detect output format by file extension --- ..._output_format_by_file_extension.reference | 20 +++++++++++++++++++ ..._detect_output_format_by_file_extension.sh | 13 ++++++++++++ 2 files changed, 33 insertions(+) create mode 100644 tests/queries/0_stateless/02181_detect_output_format_by_file_extension.reference create mode 100755 tests/queries/0_stateless/02181_detect_output_format_by_file_extension.sh diff --git a/tests/queries/0_stateless/02181_detect_output_format_by_file_extension.reference b/tests/queries/0_stateless/02181_detect_output_format_by_file_extension.reference new file mode 100644 index 00000000000..7b36cc96f5e --- /dev/null +++ b/tests/queries/0_stateless/02181_detect_output_format_by_file_extension.reference @@ -0,0 +1,20 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 diff --git a/tests/queries/0_stateless/02181_detect_output_format_by_file_extension.sh b/tests/queries/0_stateless/02181_detect_output_format_by_file_extension.sh new file mode 100755 index 00000000000..ec1edd710a1 --- /dev/null +++ b/tests/queries/0_stateless/02181_detect_output_format_by_file_extension.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash +# Tags: no-parallel, no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_LOCAL -q "select * from numbers(10)" > $CLICKHOUSE_TMP/data.parquet +$CLICKHOUSE_LOCAL -q "select * from table" < $CLICKHOUSE_TMP/data.parquet + +$CLICKHOUSE_CLIENT -q "select * from numbers(10)" > $CLICKHOUSE_TMP/data.parquet +$CLICKHOUSE_LOCAL -q "select * from table" < $CLICKHOUSE_TMP/data.parquet From 930deee699be05398aac334ce9e025d084c68a30 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 7 Mar 2024 22:02:10 +0800 Subject: [PATCH 0134/1081] fix bugs --- src/Columns/ColumnArray.cpp | 63 ++++++++++++++++++------------------- src/Columns/ColumnArray.h | 2 +- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp index 44b17c89ae1..0214375122f 100644 --- a/src/Columns/ColumnArray.cpp +++ b/src/Columns/ColumnArray.cpp @@ -43,29 +43,34 @@ namespace ErrorCodes static constexpr size_t max_array_size_as_field = 1000000; -ColumnArray::ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && offsets_column) +ColumnArray::ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && offsets_column, bool check_offsets) : data(std::move(nested_column)), offsets(std::move(offsets_column)) { - const ColumnOffsets * offsets_concrete = typeid_cast(offsets.get()); - - if (!offsets_concrete) - throw Exception(ErrorCodes::LOGICAL_ERROR, "offsets_column must be a ColumnUInt64"); - - if (!offsets_concrete->empty() && data && !data->empty()) + if (check_offsets) { - Offset last_offset = offsets_concrete->getData().back(); + const ColumnOffsets * offsets_concrete = typeid_cast(offsets.get()); - /// This will also prevent possible overflow in offset. - if (data->size() != last_offset) - throw Exception(ErrorCodes::LOGICAL_ERROR, - "offsets_column has data inconsistent with nested_column. Data size: {}, last offset: {}", - data->size(), last_offset); + if (!offsets_concrete) + throw Exception(ErrorCodes::LOGICAL_ERROR, "offsets_column must be a ColumnUInt64"); + + if (!offsets_concrete->empty() && data && !data->empty()) + { + Offset last_offset = offsets_concrete->getData().back(); + + /// This will also prevent possible overflow in offset. + if (data->size() != last_offset) + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "offsets_column has data inconsistent with nested_column. Data size: {}, last offset: {}", + data->size(), + last_offset); + } + + /** NOTE + * Arrays with constant value are possible and used in implementation of higher order functions (see FunctionReplicate). + * But in most cases, arrays with constant value are unexpected and code will work wrong. Use with caution. + */ } - - /** NOTE - * Arrays with constant value are possible and used in implementation of higher order functions (see FunctionReplicate). - * But in most cases, arrays with constant value are unexpected and code will work wrong. Use with caution. 
-      */
 }
 
 ColumnArray::ColumnArray(MutableColumnPtr && nested_column)
@@ -425,20 +430,14 @@ void ColumnArray::insertManyFromTuple(const ColumnArray & src, size_t position,
     if (tuple_size != src_tuple_size)
         throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Nested tuple size mismatch: {} vs {}", tuple_size, src_tuple_size);
 
-    MutableColumns temporary_arrays(tuple_size);
-    Columns src_temporary_arrays(tuple_size);
-    for (size_t i = 0; i < tuple_size; ++i)
-    {
-        temporary_arrays[i] = ColumnArray::create(tuple.getColumn(i).assumeMutable(), getOffsetsPtr()->assumeMutable());
-        src_temporary_arrays[i] = ColumnArray::create(src_tuple.getColumn(i).assumeMutable(), src.getOffsetsPtr()->assumeMutable());
-        assert_cast<ColumnArray &>(*temporary_arrays[i])
-            .insertManyFromImpl(assert_cast<const ColumnArray &>(*src_temporary_arrays[i]), position, length, false);
-    }
-
     Columns tuple_columns(tuple_size);
     for (size_t i = 0; i < tuple_size; ++i)
-        tuple_columns[i] = assert_cast<const ColumnArray &>(*temporary_arrays[i]).getDataPtr();
-
+    {
+        auto array_of_element = ColumnArray(tuple.getColumn(i).assumeMutable(), getOffsetsPtr()->assumeMutable(), false);
+        auto src_array_of_element = ColumnArray(src_tuple.getColumn(i).assumeMutable(), src.getOffsetsPtr()->assumeMutable());
+        array_of_element.insertManyFromImpl(src_array_of_element, position, length, false);
+        tuple_columns[i] = array_of_element.getDataPtr();
+    }
     getDataPtr() = ColumnTuple::create(std::move(tuple_columns));
 }
 
@@ -448,12 +447,12 @@ void ColumnArray::insertManyFromNullable(const ColumnArray & src, size_t positio
     const ColumnNullable & src_nullable = assert_cast<const ColumnNullable &>(src.getData());
 
     /// Process nested column without updating array offsets
-    auto array_of_nested = ColumnArray(nullable.getNestedColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable());
+    auto array_of_nested = ColumnArray(nullable.getNestedColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable(), false);
     auto src_array_of_nested = ColumnArray(src_nullable.getNestedColumnPtr()->assumeMutable(), src.getOffsetsPtr()->assumeMutable());
     array_of_nested.insertManyFromImpl(src_array_of_nested, position, length, false);
 
     /// Process null map column without updating array offsets
-    auto array_of_null_map = ColumnArray(nullable.getNullMapColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable());
+    auto array_of_null_map = ColumnArray(nullable.getNullMapColumnPtr()->assumeMutable(), getOffsetsPtr()->assumeMutable(), false);
    auto src_array_of_null_map = ColumnArray(src_nullable.getNullMapColumnPtr()->assumeMutable(), src.getOffsetsPtr()->assumeMutable());
     array_of_null_map.insertManyFromImpl(src_array_of_null_map, position, length, false);
 
diff --git a/src/Columns/ColumnArray.h b/src/Columns/ColumnArray.h
index 765f86ec552..8c4d103e7d0 100644
--- a/src/Columns/ColumnArray.h
+++ b/src/Columns/ColumnArray.h
@@ -21,7 +21,7 @@ private:
     friend class COWHelper<IColumnHelper<ColumnArray>, ColumnArray>;
 
     /** Create an array column with specified values and offsets.
*/ - ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && offsets_column); + ColumnArray(MutableColumnPtr && nested_column, MutableColumnPtr && offsets_column, bool check_offsets = true); /** Create an empty column of arrays with the type of values as in the column `nested_column` */ explicit ColumnArray(MutableColumnPtr && nested_column); From cd9d9018e0db8139e48cb722e9e9685d2a212c8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 7 Mar 2024 17:15:42 +0100 Subject: [PATCH 0135/1081] Be able to iterate --- base/base/CMakeLists.txt | 1 + base/base/itoa.cpp | 503 +++++++++++++++++++++++++++++++++++ base/base/itoa.h | 498 +++------------------------------- src/Functions/CMakeLists.txt | 1 + 4 files changed, 540 insertions(+), 463 deletions(-) create mode 100644 base/base/itoa.cpp diff --git a/base/base/CMakeLists.txt b/base/base/CMakeLists.txt index 548ba01d86a..55d046767b8 100644 --- a/base/base/CMakeLists.txt +++ b/base/base/CMakeLists.txt @@ -19,6 +19,7 @@ set (SRCS getPageSize.cpp getThreadId.cpp int8_to_string.cpp + itoa.cpp JSON.cpp mremap.cpp phdr_cache.cpp diff --git a/base/base/itoa.cpp b/base/base/itoa.cpp new file mode 100644 index 00000000000..9fefc9f0f07 --- /dev/null +++ b/base/base/itoa.cpp @@ -0,0 +1,503 @@ +// Based on https://github.com/amdn/itoa and combined with our optimizations +// +//=== itoa.h - Fast integer to ascii conversion --*- C++ -*-// +// +// The MIT License (MIT) +// Copyright (c) 2016 Arturo Martin-de-Nicolas +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included +// in all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + + +template +int digits10(T x) +{ + if (x < T(10ULL)) + return 1; + if (x < T(100ULL)) + return 2; + if constexpr (sizeof(T) == 1) + return 3; + else + { + if (x < T(1000ULL)) + return 3; + + if (x < T(1000000000000ULL)) + { + if (x < T(100000000ULL)) + { + if (x < T(1000000ULL)) + { + if (x < T(10000ULL)) + return 4; + else + return 5 + (x >= T(100000ULL)); + } + + return 7 + (x >= T(10000000ULL)); + } + + if (x < T(10000000000ULL)) + return 9 + (x >= T(1000000000ULL)); + + return 11 + (x >= T(100000000000ULL)); + } + + return 12 + digits10(x / T(1000000000000ULL)); + } +} + + +namespace +{ + +template +static constexpr T pow10(size_t x) +{ + return x ? 10 * pow10(x - 1) : 1; +} + +// Division by a power of 10 is implemented using a multiplicative inverse. 
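+// As a concrete illustration (not from the original file): a one-byte value x
+// can be divided by 10 as (205 * x) >> 11, because 205 / 2^11 = 0.10009765625
+// exceeds 1/10 by only 1/10240, which is too small an error to push the
+// truncated product past floor(x / 10) for any x < 256.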
+// This strength reduction is also done by optimizing compilers, but +// presently the fastest results are produced by using the values +// for the multiplication and the shift as given by the algorithm +// described by Agner Fog in "Optimizing Subroutines in Assembly Language" +// +// http://www.agner.org/optimize/optimizing_assembly.pdf +// +// "Integer division by a constant (all processors) +// A floating point number can be divided by a constant by multiplying +// with the reciprocal. If we want to do the same with integers, we have +// to scale the reciprocal by 2n and then shift the product to the right +// by n. There are various algorithms for finding a suitable value of n +// and compensating for rounding errors. The algorithm described below +// was invented by Terje Mathisen, Norway, and not published elsewhere." + +/// Division by constant is performed by: +/// 1. Adding 1 if needed; +/// 2. Multiplying by another constant; +/// 3. Shifting right by another constant. +template +struct Division +{ + static constexpr bool add{add_}; + static constexpr UInt multiplier{multiplier_}; + static constexpr unsigned shift{shift_}; +}; + +/// Select a type with appropriate number of bytes from the list of types. +/// First parameter is the number of bytes requested. Then goes a list of types with 1, 2, 4, ... number of bytes. +/// Example: SelectType<4, uint8_t, uint16_t, uint32_t, uint64_t> will select uint32_t. +template +struct SelectType +{ + using Result = typename SelectType::Result; +}; + +template +struct SelectType<1, T, Ts...> +{ + using Result = T; +}; + + +/// Division by 10^N where N is the size of the type. +template +using DivisionBy10PowN = typename SelectType< + N, + Division, /// divide by 10 + Division, /// divide by 100 + Division, /// divide by 10000 + Division /// divide by 100000000 + >::Result; + +template +using UnsignedOfSize = typename SelectType::Result; + +/// Holds the result of dividing an unsigned N-byte variable by 10^N resulting in +template +struct QuotientAndRemainder +{ + UnsignedOfSize quotient; // quotient with fewer than 2*N decimal digits + UnsignedOfSize remainder; // remainder with at most N decimal digits +}; + +template +QuotientAndRemainder static inline split(UnsignedOfSize value) +{ + constexpr DivisionBy10PowN division; + + UnsignedOfSize quotient = (division.multiplier * (UnsignedOfSize<2 * N>(value) + division.add)) >> division.shift; + UnsignedOfSize remainder = static_cast>(value - quotient * pow10>(N)); + + return {quotient, remainder}; +} + + +static inline char * outDigit(char * p, uint8_t value) +{ + *p = '0' + value; + ++p; + return p; +} + +// Using a lookup table to convert binary numbers from 0 to 99 +// into ascii characters as described by Andrei Alexandrescu in +// https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920/ + +static const char digits[201] = "00010203040506070809" + "10111213141516171819" + "20212223242526272829" + "30313233343536373839" + "40414243444546474849" + "50515253545556575859" + "60616263646566676869" + "70717273747576777879" + "80818283848586878889" + "90919293949596979899"; + +static inline char * outTwoDigits(char * p, uint8_t value) +{ + memcpy(p, &digits[value * 2], 2); + p += 2; + return p; +} + + +namespace convert +{ +template +static char * head(char * p, UInt u); +template +static char * tail(char * p, UInt u); + +//===----------------------------------------------------------===// +// head: find most significant digit, skip leading zeros 
+//===----------------------------------------------------------===// + +// "x" contains quotient and remainder after division by 10^N +// quotient is less than 10^N +template +static inline char * head(char * p, QuotientAndRemainder x) +{ + p = head(p, UnsignedOfSize(x.quotient)); + p = tail(p, x.remainder); + return p; +} + +// "u" is less than 10^2*N +template +static inline char * head(char * p, UInt u) +{ + return u < pow10>(N) ? head(p, UnsignedOfSize(u)) : head(p, split(u)); +} + +// recursion base case, selected when "u" is one byte +template <> +inline char * head, 1>(char * p, UnsignedOfSize<1> u) +{ + return u < 10 ? outDigit(p, u) : outTwoDigits(p, u); +} + +//===----------------------------------------------------------===// +// tail: produce all digits including leading zeros +//===----------------------------------------------------------===// + +// recursive step, "u" is less than 10^2*N +template +static inline char * tail(char * p, UInt u) +{ + QuotientAndRemainder x = split(u); + p = tail(p, UnsignedOfSize(x.quotient)); + p = tail(p, x.remainder); + return p; +} + +// recursion base case, selected when "u" is one byte +template <> +inline char * tail, 1>(char * p, UnsignedOfSize<1> u) +{ + return outTwoDigits(p, u); +} + +//===----------------------------------------------------------===// +// large values are >= 10^2*N +// where x contains quotient and remainder after division by 10^N +//===----------------------------------------------------------===// + +template +static inline char * large(char * p, QuotientAndRemainder x) +{ + QuotientAndRemainder y = split(x.quotient); + p = head(p, UnsignedOfSize(y.quotient)); + p = tail(p, y.remainder); + p = tail(p, x.remainder); + return p; +} + +//===----------------------------------------------------------===// +// handle values of "u" that might be >= 10^2*N +// where N is the size of "u" in bytes +//===----------------------------------------------------------===// + +template +static inline char * uitoa(char * p, UInt u) +{ + if (u < pow10>(N)) + return head(p, UnsignedOfSize(u)); + QuotientAndRemainder x = split(u); + + return u < pow10>(2 * N) ? head(p, x) : large(p, x); +} + +// selected when "u" is one byte +template <> +inline char * uitoa, 1>(char * p, UnsignedOfSize<1> u) +{ + if (u < 10) + return outDigit(p, u); + else if (u < 100) + return outTwoDigits(p, u); + else + { + p = outDigit(p, u / 100); + p = outTwoDigits(p, u % 100); + return p; + } +} + +//===----------------------------------------------------------===// +// handle unsigned and signed integral operands +//===----------------------------------------------------------===// + +// itoa: handle unsigned integral operands (selected by SFINAE) +template && std::is_integral_v> * = nullptr> +static inline char * itoa(U u, char * p) +{ + return convert::uitoa(p, u); +} + +// itoa: handle signed integral operands (selected by SFINAE) +template && std::is_integral_v> * = nullptr> +static inline char * itoa(I i, char * p) +{ + // Need "mask" to be filled with a copy of the sign bit. + // If "i" is a negative value, then the result of "operator >>" + // is implementation-defined, though usually it is an arithmetic + // right shift that replicates the sign bit. + // Use a conditional expression to be portable, + // a good optimizing compiler generates an arithmetic right shift + // and avoids the conditional branch. + UnsignedOfSize mask = i < 0 ? ~UnsignedOfSize(0) : 0; + // Now get the absolute value of "i" and cast to unsigned type UnsignedOfSize. 
+ // Cannot use std::abs() because the result is undefined + // in 2's complement systems for the most-negative value. + // Want to avoid conditional branch for performance reasons since + // CPU branch prediction will be ineffective when negative values + // occur randomly. + // Let "u" be "i" cast to unsigned type UnsignedOfSize. + // Subtract "u" from 2*u if "i" is positive or 0 if "i" is negative. + // This yields the absolute value with the desired type without + // using a conditional branch and without invoking undefined or + // implementation defined behavior: + UnsignedOfSize u = ((2 * UnsignedOfSize(i)) & ~mask) - UnsignedOfSize(i); + // Unconditionally store a minus sign when producing digits + // in a forward direction and increment the pointer only if + // the value is in fact negative. + // This avoids a conditional branch and is safe because we will + // always produce at least one digit and it will overwrite the + // minus sign when the value is not negative. + *p = '-'; + p += (mask & 1); + p = convert::uitoa(p, u); + return p; +} +} + + +template +static NO_INLINE char * writeUIntText(T _x, char * p) +{ + static_assert(std::is_same_v || std::is_same_v); + using T_ = std::conditional_t< + std::is_same_v, + unsigned __int128, +#if defined(__x86_64__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wbit-int-extension" + unsigned _BitInt(256) +# pragma clang diagnostic pop +#else + T +#endif + >; + + T_ x; + T_ hundred(100ULL); + if constexpr (std::is_same_v) + { + x = (T_(_x.items[T::_impl::little(1)]) << 64) + T_(_x.items[T::_impl::little(0)]); + } + else + { +#if defined(__x86_64__) + x = (T_(_x.items[T::_impl::little(3)]) << 192) + (T_(_x.items[T::_impl::little(2)]) << 128) + + (T_(_x.items[T::_impl::little(1)]) << 64) + T_(_x.items[T::_impl::little(0)]); +#else + x = _x; +#endif + } + + int len = digits10(x); + auto * pp = p + len; + while (x >= hundred) + { + const auto i = x % hundred; + x /= hundred; + pp -= 2; + outTwoDigits(pp, i); + } + if (x < 10) + *p = '0' + x; + else + outTwoDigits(p, x); + return p + len; +} + +static ALWAYS_INLINE inline char * writeLeadingMinus(char * pos) +{ + *pos = '-'; + return pos + 1; +} + +template +static ALWAYS_INLINE inline char * writeSIntText(T x, char * pos) +{ + static_assert(std::is_same_v || std::is_same_v); + + using UnsignedT = make_unsigned_t; + static constexpr T min_int = UnsignedT(1) << (sizeof(T) * 8 - 1); + + if (unlikely(x == min_int)) + { + if constexpr (std::is_same_v) + { + const char * res = "-170141183460469231731687303715884105728"; + memcpy(pos, res, strlen(res)); + return pos + strlen(res); + } + else if constexpr (std::is_same_v) + { + const char * res = "-57896044618658097711785492504343953926634992332820282019728792003956564819968"; + memcpy(pos, res, strlen(res)); + return pos + strlen(res); + } + } + + if (x < 0) + { + x = -x; + pos = writeLeadingMinus(pos); + } + return writeUIntText(UnsignedT(x), pos); +} +} + +template +char * itoa(T i, char * p) +{ + return convert::itoa(i, p); +} + +template <> +char * itoa(UInt8 i, char * p) +{ + return convert::itoa(uint8_t(i), p); +} + +template <> +char * itoa(Int8 i, char * p) +{ + return convert::itoa(int8_t(i), p); +} + +template <> +char * itoa(UInt128 i, char * p) +{ + return writeUIntText(i, p); +} + +template <> +char * itoa(Int128 i, char * p) +{ + return writeSIntText(i, p); +} + +template <> +char * itoa(UInt256 i, char * p) +{ + return writeUIntText(i, p); +} + +template <> +char * itoa(Int256 i, char * p) +{ + return 
writeSIntText(i, p); +} + +#define FOR_MISSING_INTEGER_TYPES(M) \ + M(int8_t) \ + M(uint8_t) \ + M(UInt16) \ + M(UInt32) \ + M(UInt64) \ + M(Int16) \ + M(Int32) \ + M(Int64) + +#define INSTANTIATION(T) template char * itoa(T i, char * p); +FOR_MISSING_INTEGER_TYPES(INSTANTIATION) + +#undef FOR_MISSING_INTEGER_TYPES +#undef INSTANTIATION + + +#define DIGITS_INTEGER_TYPES(M) \ + M(uint8_t) \ + M(UInt8) \ + M(UInt16) \ + M(UInt32) \ + M(UInt64) \ + M(UInt128) \ + M(UInt256) + +#define INSTANTIATION(T) template int digits10(T x); +DIGITS_INTEGER_TYPES(INSTANTIATION) + +#undef DIGITS_INTEGER_TYPES +#undef INSTANTIATION diff --git a/base/base/itoa.h b/base/base/itoa.h index a36eecaf1e5..71603cdeb88 100644 --- a/base/base/itoa.h +++ b/base/base/itoa.h @@ -1,474 +1,46 @@ #pragma once -// Based on https://github.com/amdn/itoa and combined with our optimizations -// -//=== itoa.h - Fast integer to ascii conversion --*- C++ -*-// -// -// The MIT License (MIT) -// Copyright (c) 2016 Arturo Martin-de-Nicolas -// -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included -// in all copies or substantial portions of the Software. -// -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include #include +template char * itoa(T i, char * p); -template -inline int digits10(T x) -{ - if (x < 10ULL) - return 1; - if (x < 100ULL) - return 2; - if (x < 1000ULL) - return 3; +template <> char * itoa(UInt8 i, char * p); +template <> char * itoa(Int8 i, char * p); +template <> char * itoa(UInt128 i, char * p); +template <> char * itoa(Int128 i, char * p); +template <> char * itoa(UInt256 i, char * p); +template <> char * itoa(Int256 i, char * p); - if (x < 1000000000000ULL) - { - if (x < 100000000ULL) - { - if (x < 1000000ULL) - { - if (x < 10000ULL) - return 4; - else - return 5 + (x >= 100000ULL); - } +#define FOR_MISSING_INTEGER_TYPES(M) \ + M(int8_t) \ + M(uint8_t) \ + M(UInt16) \ + M(UInt32) \ + M(UInt64) \ + M(Int16) \ + M(Int32) \ + M(Int64) - return 7 + (x >= 10000000ULL); - } +#define INSTANTIATION(T) \ + extern template char * itoa(T i, char * p); +FOR_MISSING_INTEGER_TYPES(INSTANTIATION) - if (x < 10000000000ULL) - return 9 + (x >= 1000000000ULL); - - return 11 + (x >= 100000000000ULL); - } - - return 12 + digits10(x / 1000000000000ULL); -} +#undef FOR_MISSING_INTEGER_TYPES +#undef INSTANTIATION -namespace impl -{ +template int digits10(T x); -template -static constexpr T pow10(size_t x) -{ - return x ? 
10 * pow10(x - 1) : 1; -} - -// Division by a power of 10 is implemented using a multiplicative inverse. -// This strength reduction is also done by optimizing compilers, but -// presently the fastest results are produced by using the values -// for the multiplication and the shift as given by the algorithm -// described by Agner Fog in "Optimizing Subroutines in Assembly Language" -// -// http://www.agner.org/optimize/optimizing_assembly.pdf -// -// "Integer division by a constant (all processors) -// A floating point number can be divided by a constant by multiplying -// with the reciprocal. If we want to do the same with integers, we have -// to scale the reciprocal by 2n and then shift the product to the right -// by n. There are various algorithms for finding a suitable value of n -// and compensating for rounding errors. The algorithm described below -// was invented by Terje Mathisen, Norway, and not published elsewhere." - -/// Division by constant is performed by: -/// 1. Adding 1 if needed; -/// 2. Multiplying by another constant; -/// 3. Shifting right by another constant. -template -struct Division -{ - static constexpr bool add{add_}; - static constexpr UInt multiplier{multiplier_}; - static constexpr unsigned shift{shift_}; -}; - -/// Select a type with appropriate number of bytes from the list of types. -/// First parameter is the number of bytes requested. Then goes a list of types with 1, 2, 4, ... number of bytes. -/// Example: SelectType<4, uint8_t, uint16_t, uint32_t, uint64_t> will select uint32_t. -template -struct SelectType -{ - using Result = typename SelectType::Result; -}; - -template -struct SelectType<1, T, Ts...> -{ - using Result = T; -}; - - -/// Division by 10^N where N is the size of the type. -template -using DivisionBy10PowN = typename SelectType -< - N, - Division, /// divide by 10 - Division, /// divide by 100 - Division, /// divide by 10000 - Division /// divide by 100000000 ->::Result; - -template -using UnsignedOfSize = typename SelectType -< - N, - uint8_t, - uint16_t, - uint32_t, - uint64_t, - __uint128_t ->::Result; - -/// Holds the result of dividing an unsigned N-byte variable by 10^N resulting in -template -struct QuotientAndRemainder -{ - UnsignedOfSize quotient; // quotient with fewer than 2*N decimal digits - UnsignedOfSize remainder; // remainder with at most N decimal digits -}; - -template -QuotientAndRemainder static inline split(UnsignedOfSize value) -{ - constexpr DivisionBy10PowN division; - - UnsignedOfSize quotient = (division.multiplier * (UnsignedOfSize<2 * N>(value) + division.add)) >> division.shift; - UnsignedOfSize remainder = static_cast>(value - quotient * pow10>(N)); - - return {quotient, remainder}; -} - - -static inline char * outDigit(char * p, uint8_t value) -{ - *p = '0' + value; - ++p; - return p; -} - -// Using a lookup table to convert binary numbers from 0 to 99 -// into ascii characters as described by Andrei Alexandrescu in -// https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920/ - -static const char digits[201] = "00010203040506070809" - "10111213141516171819" - "20212223242526272829" - "30313233343536373839" - "40414243444546474849" - "50515253545556575859" - "60616263646566676869" - "70717273747576777879" - "80818283848586878889" - "90919293949596979899"; - -static inline char * outTwoDigits(char * p, uint8_t value) -{ - memcpy(p, &digits[value * 2], 2); - p += 2; - return p; -} - - -namespace convert -{ - template static char * head(char * p, UInt u); - 
template static char * tail(char * p, UInt u); - - //===----------------------------------------------------------===// - // head: find most significant digit, skip leading zeros - //===----------------------------------------------------------===// - - // "x" contains quotient and remainder after division by 10^N - // quotient is less than 10^N - template - static inline char * head(char * p, QuotientAndRemainder x) - { - p = head(p, UnsignedOfSize(x.quotient)); - p = tail(p, x.remainder); - return p; - } - - // "u" is less than 10^2*N - template - static inline char * head(char * p, UInt u) - { - return u < pow10>(N) - ? head(p, UnsignedOfSize(u)) - : head(p, split(u)); - } - - // recursion base case, selected when "u" is one byte - template <> - inline char * head, 1>(char * p, UnsignedOfSize<1> u) - { - return u < 10 - ? outDigit(p, u) - : outTwoDigits(p, u); - } - - //===----------------------------------------------------------===// - // tail: produce all digits including leading zeros - //===----------------------------------------------------------===// - - // recursive step, "u" is less than 10^2*N - template - static inline char * tail(char * p, UInt u) - { - QuotientAndRemainder x = split(u); - p = tail(p, UnsignedOfSize(x.quotient)); - p = tail(p, x.remainder); - return p; - } - - // recursion base case, selected when "u" is one byte - template <> - inline char * tail, 1>(char * p, UnsignedOfSize<1> u) - { - return outTwoDigits(p, u); - } - - //===----------------------------------------------------------===// - // large values are >= 10^2*N - // where x contains quotient and remainder after division by 10^N - //===----------------------------------------------------------===// - - template - static inline char * large(char * p, QuotientAndRemainder x) - { - QuotientAndRemainder y = split(x.quotient); - p = head(p, UnsignedOfSize(y.quotient)); - p = tail(p, y.remainder); - p = tail(p, x.remainder); - return p; - } - - //===----------------------------------------------------------===// - // handle values of "u" that might be >= 10^2*N - // where N is the size of "u" in bytes - //===----------------------------------------------------------===// - - template - static inline char * uitoa(char * p, UInt u) - { - if (u < pow10>(N)) - return head(p, UnsignedOfSize(u)); - QuotientAndRemainder x = split(u); - - return u < pow10>(2 * N) - ? head(p, x) - : large(p, x); - } - - // selected when "u" is one byte - template <> - inline char * uitoa, 1>(char * p, UnsignedOfSize<1> u) - { - if (u < 10) - return outDigit(p, u); - else if (u < 100) - return outTwoDigits(p, u); - else - { - p = outDigit(p, u / 100); - p = outTwoDigits(p, u % 100); - return p; - } - } - - //===----------------------------------------------------------===// - // handle unsigned and signed integral operands - //===----------------------------------------------------------===// - - // itoa: handle unsigned integral operands (selected by SFINAE) - template && std::is_integral_v> * = nullptr> - static inline char * itoa(U u, char * p) - { - return convert::uitoa(p, u); - } - - // itoa: handle signed integral operands (selected by SFINAE) - template && std::is_integral_v> * = nullptr> - static inline char * itoa(I i, char * p) - { - // Need "mask" to be filled with a copy of the sign bit. - // If "i" is a negative value, then the result of "operator >>" - // is implementation-defined, though usually it is an arithmetic - // right shift that replicates the sign bit. 
- // Use a conditional expression to be portable, - // a good optimizing compiler generates an arithmetic right shift - // and avoids the conditional branch. - UnsignedOfSize mask = i < 0 ? ~UnsignedOfSize(0) : 0; - // Now get the absolute value of "i" and cast to unsigned type UnsignedOfSize. - // Cannot use std::abs() because the result is undefined - // in 2's complement systems for the most-negative value. - // Want to avoid conditional branch for performance reasons since - // CPU branch prediction will be ineffective when negative values - // occur randomly. - // Let "u" be "i" cast to unsigned type UnsignedOfSize. - // Subtract "u" from 2*u if "i" is positive or 0 if "i" is negative. - // This yields the absolute value with the desired type without - // using a conditional branch and without invoking undefined or - // implementation defined behavior: - UnsignedOfSize u = ((2 * UnsignedOfSize(i)) & ~mask) - UnsignedOfSize(i); - // Unconditionally store a minus sign when producing digits - // in a forward direction and increment the pointer only if - // the value is in fact negative. - // This avoids a conditional branch and is safe because we will - // always produce at least one digit and it will overwrite the - // minus sign when the value is not negative. - *p = '-'; - p += (mask & 1); - p = convert::uitoa(p, u); - return p; - } -} - - -template -static inline char * writeUIntText(T _x, char * p) -{ - int len = digits10(_x); - static_assert(std::is_same_v || std::is_same_v); - using T_ = std::conditional_t< - std::is_same_v, - unsigned __int128, -#if defined(__x86_64__) -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wbit-int-extension" - unsigned _BitInt(256) -#pragma clang diagnostic pop -#else - T -#endif - >; - - T_ x; - T_ hundred(100ULL); - if constexpr (std::is_same_v) - { - x = (T_(_x.items[T::_impl::little(1)]) << 64) + T_(_x.items[T::_impl::little(0)]); - } - else - { -#if defined(__x86_64__) - x = (T_(_x.items[T::_impl::little(3)]) << 192) + (T_(_x.items[T::_impl::little(2)]) << 128) + - (T_(_x.items[T::_impl::little(1)]) << 64) + T_(_x.items[T::_impl::little(0)]); -#else - x = _x; -#endif - } - - auto * pp = p + len; - while (x >= hundred) - { - const auto i = x % hundred; - x /= hundred; - pp -= 2; - outTwoDigits(pp, i); - } - if (x < 10) - *p = '0' + x; - else - outTwoDigits(p, x); - return p + len; -} - -static inline char * writeLeadingMinus(char * pos) -{ - *pos = '-'; - return pos + 1; -} - -template -static inline char * writeSIntText(T x, char * pos) -{ - static_assert(std::is_same_v || std::is_same_v); - - using UnsignedT = make_unsigned_t; - static constexpr T min_int = UnsignedT(1) << (sizeof(T) * 8 - 1); - - if (unlikely(x == min_int)) - { - if constexpr (std::is_same_v) - { - const char * res = "-170141183460469231731687303715884105728"; - memcpy(pos, res, strlen(res)); - return pos + strlen(res); - } - else if constexpr (std::is_same_v) - { - const char * res = "-57896044618658097711785492504343953926634992332820282019728792003956564819968"; - memcpy(pos, res, strlen(res)); - return pos + strlen(res); - } - } - - if (x < 0) - { - x = -x; - pos = writeLeadingMinus(pos); - } - return writeUIntText(UnsignedT(x), pos); -} - -} - -template -char * itoa(I i, char * p) -{ - return impl::convert::itoa(i, p); -} - -template <> -inline char * itoa(char8_t i, char * p) -{ - return impl::convert::itoa(uint8_t(i), p); -} - -template <> -inline char * itoa(UInt128 i, char * p) -{ - return impl::writeUIntText(i, p); -} - -template <> -inline char * 
itoa(Int128 i, char * p) -{ - return impl::writeSIntText(i, p); -} - -template <> -inline char * itoa(UInt256 i, char * p) -{ - return impl::writeUIntText(i, p); -} - -template <> -inline char * itoa(Int256 i, char * p) -{ - return impl::writeSIntText(i, p); -} +#define DIGITS_INTEGER_TYPES(M) \ + M(uint8_t) \ + M(UInt8) \ + M(UInt16) \ + M(UInt32) \ + M(UInt64) \ + M(UInt128) \ + M(UInt256) +#define INSTANTIATION(T) \ + extern template int digits10(T x); +DIGITS_INTEGER_TYPES(INSTANTIATION) +#undef DIGITS_INTEGER_TYPES +#undef INSTANTIATION diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index ac3e3671ae0..dea369a508a 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -37,6 +37,7 @@ list (APPEND PUBLIC_LIBS clickhouse_dictionaries_embedded clickhouse_parsers ch_contrib::consistent_hashing + common dbms ch_contrib::metrohash ch_contrib::murmurhash From 7fd13df8a5055892d2f8cdc83dcb900c19c87a95 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Thu, 7 Mar 2024 17:09:55 +0100 Subject: [PATCH 0136/1081] check memory limit periodically --- programs/keeper/Keeper.cpp | 10 +++++++++ programs/server/Server.cpp | 1 + src/Common/CgroupsMemoryUsageObserver.cpp | 26 +++++++++++++++++------ src/Common/CgroupsMemoryUsageObserver.h | 7 ++++-- 4 files changed, 35 insertions(+), 9 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 8972c82eab8..76dd8cb15a5 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -623,6 +624,15 @@ try buildLoggers(config(), logger()); main_config_reloader->start(); + std::optional observer; + auto cgroups_memory_observer_wait_time = config().getUInt64("keeper_server.cgroups_memory_observer_wait_time", 1); + if (cgroups_memory_observer_wait_time > 0) + { + observer.emplace(std::chrono::seconds(cgroups_memory_observer_wait_time)); + observer->startThread(); + } + + LOG_INFO(log, "Ready for connections."); waitForTerminationRequest(); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index c45291ba52c..6b282893dee 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1362,6 +1362,7 @@ try cgroups_memory_usage_observer->setLimits( static_cast(max_server_memory_usage * hard_limit_ratio), static_cast(max_server_memory_usage * soft_limit_ratio)); + cgroups_memory_usage_observer->startThread(); } size_t merges_mutations_memory_usage_soft_limit = new_server_settings.merges_mutations_memory_usage_soft_limit; diff --git a/src/Common/CgroupsMemoryUsageObserver.cpp b/src/Common/CgroupsMemoryUsageObserver.cpp index 9bed6b191e4..5f24c2553b5 100644 --- a/src/Common/CgroupsMemoryUsageObserver.cpp +++ b/src/Common/CgroupsMemoryUsageObserver.cpp @@ -8,7 +8,9 @@ #include #include #include +#include #include +#include #include #include @@ -48,11 +50,10 @@ CgroupsMemoryUsageObserver::~CgroupsMemoryUsageObserver() void CgroupsMemoryUsageObserver::setLimits(uint64_t hard_limit_, uint64_t soft_limit_) { + std::lock_guard lock(set_limit_mutex); if (hard_limit_ == hard_limit && soft_limit_ == soft_limit) return; - stopThread(); - hard_limit = hard_limit_; soft_limit = soft_limit_; @@ -94,8 +95,6 @@ void CgroupsMemoryUsageObserver::setLimits(uint64_t hard_limit_, uint64_t soft_l } }; - startThread(); - LOG_INFO(log, "Set new limits, soft limit: {}, hard limit: {}", ReadableSize(soft_limit_), ReadableSize(hard_limit_)); } @@ -277,7 +276,7 @@ void 
CgroupsMemoryUsageObserver::stopThread() void CgroupsMemoryUsageObserver::runThread() { setThreadName("CgrpMemUsgObsr"); - + last_memory_amount = getMemoryAmount(); std::unique_lock lock(thread_mutex); while (true) { @@ -286,8 +285,21 @@ void CgroupsMemoryUsageObserver::runThread() try { - uint64_t memory_usage = file.readMemoryUsage(); - processMemoryUsage(memory_usage); + uint64_t memory_limit = getMemoryAmount(); + if (memory_limit != last_memory_amount) + { + last_memory_amount = memory_limit; + /// if we find memory amount changes, we just reload config. + /// Reloading config will check the memory amount again and calculate soft/hard limit again. + auto global_context = getContext()->getGlobalContext(); + global_context->reloadConfig(); + } + std::lock_guard set_limit_lock(set_limit_mutex); + if (soft_limit > 0 && hard_limit > 0) + { + uint64_t memory_usage = file.readMemoryUsage(); + processMemoryUsage(memory_usage); + } } catch (...) { diff --git a/src/Common/CgroupsMemoryUsageObserver.h b/src/Common/CgroupsMemoryUsageObserver.h index 28bf08c82b5..6edf2e2049d 100644 --- a/src/Common/CgroupsMemoryUsageObserver.h +++ b/src/Common/CgroupsMemoryUsageObserver.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include @@ -14,7 +15,7 @@ namespace DB /// - When the soft memory limit is hit, drop jemalloc cache. /// - When the hard memory limit is hit, update MemoryTracking metric to throw memory exceptions faster. #if defined(OS_LINUX) -class CgroupsMemoryUsageObserver +class CgroupsMemoryUsageObserver : public WithContext { public: enum class CgroupsVersion @@ -27,6 +28,7 @@ public: ~CgroupsMemoryUsageObserver(); void setLimits(uint64_t hard_limit_, uint64_t soft_limit_); + void startThread(); size_t getHardLimit() const { return hard_limit; } size_t getSoftLimit() const { return soft_limit; } @@ -64,16 +66,17 @@ private: File file; - void startThread(); void stopThread(); void runThread(); void processMemoryUsage(uint64_t usage); std::mutex thread_mutex; + std::mutex set_limit_mutex; std::condition_variable cond; ThreadFromGlobalPool thread; bool quit = false; + uint64_t last_memory_amount; }; #else From 0f0ea422f21af8e37aa5c8ef58002d608cde5c77 Mon Sep 17 00:00:00 2001 From: serxa Date: Thu, 7 Mar 2024 17:05:54 +0000 Subject: [PATCH 0137/1081] separate limits on number of waiting and executing queries --- .../settings.md | 16 ++- programs/server/Server.cpp | 1 + src/Common/AsyncLoader.cpp | 21 +++- src/Common/AsyncLoader.h | 65 ++++++++++- src/Common/tests/gtest_async_loader.cpp | 66 +++++++++++ src/Core/ServerSettings.h | 1 + src/Interpreters/ProcessList.cpp | 103 +++++++++++++++--- src/Interpreters/ProcessList.h | 41 ++++++- .../System/StorageSystemServerSettings.cpp | 1 + 9 files changed, 293 insertions(+), 22 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 07c9a2b88ab..63fbd9d1964 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -379,6 +379,18 @@ Type: UInt64 Default: 0 +## max_waiting_queries + +Limit on total number of concurrently waiting queries. Execution of a waiting query is blocked while required tables are loading asynchronously (see `async_load_databases`). 
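+
+The accounting works roughly as sketched below (a simplified sketch with assumed names, not the literal server code; the real check lives in `ProcessList::insert`): queries that are merely waiting for asynchronous loads enlarge the allowance, so they do not occupy regular concurrency slots.
+
+    #include <cstddef>
+
+    // Simplified sketch: a query is admitted while the entry count stays below
+    // the configured maximum *plus* the number of queries currently blocked
+    // waiting for asynchronous table loading.
+    bool canAdmitQuery(std::size_t current_queries, std::size_t waiting_queries,
+                       std::size_t max_size, std::size_t max_waiting)
+    {
+        if (max_waiting != 0 && waiting_queries >= max_waiting)
+            return false;  // the waiting-queries limit itself is exhausted
+        // max_size == 0 means unlimited; waiting queries enlarge the allowance
+        return max_size == 0 || current_queries < max_size + waiting_queries;
+    }
+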
Note that waiting queries are not counted when `max_concurrent_queries`, `max_concurrent_insert_queries`, `max_concurrent_select_queries`, `max_concurrent_queries_for_user` and `max_concurrent_queries_for_all_users` limits are checked. This correction is done to avoid hitting these limits just after server startup. Zero means unlimited. + +:::note +This setting can be modified at runtime and will take effect immediately. Queries that are already running will remain unchanged. +::: + +Type: UInt64 + +Default: 0 + ## max_connections Max server connections. @@ -1725,7 +1737,7 @@ Default value: `0.5`. Asynchronous loading of databases and tables. -If `true` all non-system databases with `Ordinary`, `Atomic` and `Replicated` engine will be loaded asynchronously after the ClickHouse server start up. See `system.asynchronous_loader` table, `tables_loader_background_pool_size` and `tables_loader_foreground_pool_size` server settings. Any query that tries to access a table, that is not yet loaded, will wait for exactly this table to be started up. If load job fails, query will rethrow an error (instead of shutting down the whole server in case of `async_load_databases = false`). The table that is waited for by at least one query will be loaded with higher priority. DDL queries on a database will wait for exactly that database to be started up. +If `true` all non-system databases with `Ordinary`, `Atomic` and `Replicated` engine will be loaded asynchronously after the ClickHouse server start up. See `system.asynchronous_loader` table, `tables_loader_background_pool_size` and `tables_loader_foreground_pool_size` server settings. Any query that tries to access a table, that is not yet loaded, will wait for exactly this table to be started up. If load job fails, query will rethrow an error (instead of shutting down the whole server in case of `async_load_databases = false`). The table that is waited for by at least one query will be loaded with higher priority. DDL queries on a database will wait for exactly that database to be started up. Also consider setting a limit `max_waiting_queries` for the total number of waiting queries. If `false`, all databases are loaded when the server starts. @@ -2926,7 +2938,7 @@ Default: 0 ## ignore_empty_sql_security_in_create_view_query {#ignore_empty_sql_security_in_create_view_query} -If true, ClickHouse doesn't write defaults for empty SQL security statement in CREATE VIEW queries. +If true, ClickHouse doesn't write defaults for empty SQL security statement in CREATE VIEW queries. 
:::note This setting is only necessary for the migration period and will become obsolete in 24.4 diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index a10f47be0b8..336563665a2 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1429,6 +1429,7 @@ try global_context->getProcessList().setMaxSize(new_server_settings.max_concurrent_queries); global_context->getProcessList().setMaxInsertQueriesAmount(new_server_settings.max_concurrent_insert_queries); global_context->getProcessList().setMaxSelectQueriesAmount(new_server_settings.max_concurrent_select_queries); + global_context->getProcessList().setMaxWaitingQueriesAmount(new_server_settings.max_waiting_queries); if (config->has("keeper_server")) global_context->updateKeeperConfiguration(*config); diff --git a/src/Common/AsyncLoader.cpp b/src/Common/AsyncLoader.cpp index 140194e10b4..80e4c72f1c1 100644 --- a/src/Common/AsyncLoader.cpp +++ b/src/Common/AsyncLoader.cpp @@ -140,6 +140,11 @@ void LoadJob::finish() finish_time = std::chrono::system_clock::now(); if (waiters > 0) finished.notify_all(); + else + { + on_waiters_increment = {}; + on_waiters_decrement = {}; + } } void LoadJob::scheduled(UInt64 job_id_) @@ -765,11 +770,25 @@ void AsyncLoader::wait(std::unique_lock & job_lock, const LoadJobPtr if (job->load_status != LoadStatus::PENDING) // Shortcut just to avoid incrementing ProfileEvents return; + if (job->on_waiters_increment) + job->on_waiters_increment(job); + + // WARNING: it is important not to throw below this point to avoid `on_waiters_increment` call w/o matching `on_waiters_decrement` call + Stopwatch watch; job->waiters++; job->finished.wait(job_lock, [&] { return job->load_status != LoadStatus::PENDING; }); job->waiters--; ProfileEvents::increment(ProfileEvents::AsyncLoaderWaitMicroseconds, watch.elapsedMicroseconds()); + + if (job->on_waiters_decrement) + job->on_waiters_decrement(job); + + if (job->waiters == 0) + { + job->on_waiters_increment = {}; + job->on_waiters_decrement = {}; + } } bool AsyncLoader::canSpawnWorker(Pool & pool, std::unique_lock &) @@ -859,7 +878,7 @@ void AsyncLoader::worker(Pool & pool) try { current_load_job = job.get(); - SCOPE_EXIT({ current_load_job = nullptr; }); // Note that recursive job execution is not supported + SCOPE_EXIT({ current_load_job = nullptr; }); // Note that recursive job execution is not supported, but jobs can wait one another job->execute(*this, pool_id, job); exception_from_job = {}; } diff --git a/src/Common/AsyncLoader.h b/src/Common/AsyncLoader.h index b1b336d24dc..3f81a36aa96 100644 --- a/src/Common/AsyncLoader.h +++ b/src/Common/AsyncLoader.h @@ -59,7 +59,8 @@ enum class LoadStatus class LoadJob : private boost::noncopyable { public: - template + // NOTE: makeLoadJob() helper should be used instead of direct ctor call + template LoadJob(LoadJobSetType && dependencies_, String name_, size_t pool_id_, DFFunc && dependency_failure_, Func && func_) : dependencies(std::forward(dependencies_)) , name(std::move(name_)) @@ -69,6 +70,19 @@ public: , func(std::forward(func_)) {} + // NOTE: makeLoadJob() helper should be used instead of direct ctor call + template + LoadJob(LoadJobSetType && dependencies_, String name_, size_t pool_id_, WIFunc && on_waiters_increment_, WDFunc && on_waiters_decrement_, DFFunc && dependency_failure_, Func && func_) + : dependencies(std::forward(dependencies_)) + , name(std::move(name_)) + , execution_pool_id(pool_id_) + , pool_id(pool_id_) + , on_waiters_increment(std::forward(on_waiters_increment_)) + , 
on_waiters_decrement(std::forward(on_waiters_decrement_)) + , dependency_failure(std::forward(dependency_failure_)) + , func(std::forward(func_)) + {} + // Current job status. LoadStatus status() const; std::exception_ptr exception() const; @@ -112,6 +126,13 @@ private: std::atomic execution_pool_id; std::atomic pool_id; + // Handlers that is called by every new waiting thread, just before going to sleep. + // If `on_waiters_increment` throws, then wait is canceled, and corresponding `on_waiters_decrement` will never be called. + // It can be used for counting and limits on number of waiters. + // Note that implementations are called under `LoadJob::mutex` and should be fast. + std::function on_waiters_increment; + std::function on_waiters_decrement; + // Handler for failed or canceled dependencies. // If job needs to be canceled on `dependency` failure, then function should set `cancel` to a specific reason. // Note that implementation should be fast and cannot use AsyncLoader, because it is called under `AsyncLoader::mutex`. @@ -140,8 +161,50 @@ void cancelOnDependencyFailure(const LoadJobPtr & self, const LoadJobPtr & depen void ignoreDependencyFailure(const LoadJobPtr & self, const LoadJobPtr & dependency, std::exception_ptr & cancel); template concept LoadJobDependencyFailure = std::invocable; +template concept LoadJobOnWaiters = std::invocable; template concept LoadJobFunc = std::invocable; +LoadJobPtr makeLoadJob(LoadJobSet && dependencies, String name, LoadJobOnWaiters auto && on_waiters_increment, LoadJobOnWaiters auto && on_waiters_decrement, LoadJobDependencyFailure auto && dependency_failure, LoadJobFunc auto && func) +{ + return std::make_shared(std::move(dependencies), std::move(name), 0, on_waiters_increment, on_waiters_decrement, std::forward(dependency_failure), std::forward(func)); +} + +LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, String name, LoadJobOnWaiters auto && on_waiters_increment, LoadJobOnWaiters auto && on_waiters_decrement, LoadJobDependencyFailure auto && dependency_failure, LoadJobFunc auto && func) +{ + return std::make_shared(dependencies, std::move(name), 0, on_waiters_increment, on_waiters_decrement, std::forward(dependency_failure), std::forward(func)); +} + +LoadJobPtr makeLoadJob(LoadJobSet && dependencies, size_t pool_id, String name, LoadJobOnWaiters auto && on_waiters_increment, LoadJobOnWaiters auto && on_waiters_decrement, LoadJobDependencyFailure auto && dependency_failure, LoadJobFunc auto && func) +{ + return std::make_shared(std::move(dependencies), std::move(name), pool_id, on_waiters_increment, on_waiters_decrement, std::forward(dependency_failure), std::forward(func)); +} + +LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, size_t pool_id, String name, LoadJobOnWaiters auto && on_waiters_increment, LoadJobOnWaiters auto && on_waiters_decrement, LoadJobDependencyFailure auto && dependency_failure, LoadJobFunc auto && func) +{ + return std::make_shared(dependencies, std::move(name), pool_id, on_waiters_increment, on_waiters_decrement, std::forward(dependency_failure), std::forward(func)); +} + +LoadJobPtr makeLoadJob(LoadJobSet && dependencies, String name, LoadJobOnWaiters auto && on_waiters_increment, LoadJobOnWaiters auto && on_waiters_decrement, LoadJobFunc auto && func) +{ + return std::make_shared(std::move(dependencies), std::move(name), 0, on_waiters_increment, on_waiters_decrement, cancelOnDependencyFailure, std::forward(func)); +} + +LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, String name, 
LoadJobOnWaiters auto && on_waiters_increment, LoadJobOnWaiters auto && on_waiters_decrement, LoadJobFunc auto && func) +{ + return std::make_shared(dependencies, std::move(name), 0, on_waiters_increment, on_waiters_decrement, cancelOnDependencyFailure, std::forward(func)); +} + +LoadJobPtr makeLoadJob(LoadJobSet && dependencies, size_t pool_id, String name, LoadJobOnWaiters auto && on_waiters_increment, LoadJobOnWaiters auto && on_waiters_decrement, LoadJobFunc auto && func) +{ + return std::make_shared(std::move(dependencies), std::move(name), pool_id, on_waiters_increment, on_waiters_decrement, cancelOnDependencyFailure, std::forward(func)); +} + +LoadJobPtr makeLoadJob(const LoadJobSet & dependencies, size_t pool_id, String name, LoadJobOnWaiters auto && on_waiters_increment, LoadJobOnWaiters auto && on_waiters_decrement, LoadJobFunc auto && func) +{ + return std::make_shared(dependencies, std::move(name), pool_id, on_waiters_increment, on_waiters_decrement, cancelOnDependencyFailure, std::forward(func)); +} + + LoadJobPtr makeLoadJob(LoadJobSet && dependencies, String name, LoadJobDependencyFailure auto && dependency_failure, LoadJobFunc auto && func) { return std::make_shared(std::move(dependencies), std::move(name), 0, std::forward(dependency_failure), std::forward(func)); diff --git a/src/Common/tests/gtest_async_loader.cpp b/src/Common/tests/gtest_async_loader.cpp index fc2537abcfc..62a27f259cc 100644 --- a/src/Common/tests/gtest_async_loader.cpp +++ b/src/Common/tests/gtest_async_loader.cpp @@ -643,6 +643,72 @@ TEST(AsyncLoader, CustomDependencyFailure) ASSERT_EQ(good_count.load(), 3); } +TEST(AsyncLoader, WaitersLimit) +{ + AsyncLoaderTest t(16); + + std::atomic waiters_total{0}; + int waiters_limit = 5; + auto waiters_inc = [&] (const LoadJobPtr &) { + int value = waiters_total.load(); + while (true) + { + if (value >= waiters_limit) + throw Exception(ErrorCodes::ASYNC_LOAD_FAILED, "Too many waiters: {}", value); + if (waiters_total.compare_exchange_strong(value, value + 1)) + break; + } + }; + auto waiters_dec = [&] (const LoadJobPtr &) { + waiters_total.fetch_sub(1); + }; + + std::barrier sync(2); + t.loader.start(); + + auto job_func = [&] (AsyncLoader &, const LoadJobPtr &) { + sync.arrive_and_wait(); // (A) + }; + + auto job = makeLoadJob({}, "job", waiters_inc, waiters_dec, job_func); + auto task = t.schedule({job}); + + std::atomic failure{0}; + std::atomic success{0}; + std::vector waiters; + waiters.reserve(10); + auto waiter = [&] { + try + { + t.loader.wait(job); + success.fetch_add(1); + } + catch(...) + { + failure.fetch_add(1); + } + }; + + for (int i = 0; i < 10; i++) + waiters.emplace_back(waiter); + + while (failure.load() != 5) + std::this_thread::yield(); + + ASSERT_EQ(job->waitersCount(), 5); + + sync.arrive_and_wait(); // (A) + + for (auto & thread : waiters) + thread.join(); + + ASSERT_EQ(success.load(), 5); + ASSERT_EQ(failure.load(), 5); + ASSERT_EQ(waiters_total.load(), 0); + + t.loader.wait(); +} + TEST(AsyncLoader, TestConcurrency) { AsyncLoaderTest t(10); diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h index c82255ec59c..129b1016fca 100644 --- a/src/Core/ServerSettings.h +++ b/src/Core/ServerSettings.h @@ -64,6 +64,7 @@ namespace DB M(UInt64, max_concurrent_queries, 0, "Maximum number of concurrently executed queries. Zero means unlimited.", 0) \ M(UInt64, max_concurrent_insert_queries, 0, "Maximum number of concurrently INSERT queries. 
Zero means unlimited.", 0) \ M(UInt64, max_concurrent_select_queries, 0, "Maximum number of concurrently SELECT queries. Zero means unlimited.", 0) \ + M(UInt64, max_waiting_queries, 0, "Maximum number of concurrently waiting queries blocked due to `async_load_databases`. Note that waiting queries are not considered by `max_concurrent_*queries*` limits. Zero means unlimited.", 0) \ \ M(Double, cache_size_to_ram_max_ratio, 0.5, "Set cache size ro RAM max ratio. Allows to lower cache size on low-memory systems.", 0) \ M(String, uncompressed_cache_policy, DEFAULT_UNCOMPRESSED_CACHE_POLICY, "Uncompressed cache policy name.", 0) \ diff --git a/src/Interpreters/ProcessList.cpp b/src/Interpreters/ProcessList.cpp index 3bd7b2d4206..f451d561e60 100644 --- a/src/Interpreters/ProcessList.cpp +++ b/src/Interpreters/ProcessList.cpp @@ -83,25 +83,31 @@ ProcessList::insert(const String & query_, const IAST * ast, ContextMutablePtr q IAST::QueryKind query_kind = ast->getQueryKind(); const auto queue_max_wait_ms = settings.queue_max_wait_ms.totalMilliseconds(); - if (!is_unlimited_query && max_size && processes.size() >= max_size) + UInt64 waiting_queries = waiting_queries_amount.load(); + if (!is_unlimited_query && max_size && processes.size() >= max_size + waiting_queries) { if (queue_max_wait_ms) LOG_WARNING(getLogger("ProcessList"), "Too many simultaneous queries, will wait {} ms.", queue_max_wait_ms); - if (!queue_max_wait_ms || !have_space.wait_for(lock, std::chrono::milliseconds(queue_max_wait_ms), [&]{ return processes.size() < max_size; })) - throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, "Too many simultaneous queries. Maximum: {}", max_size); + if (!queue_max_wait_ms || !have_space.wait_for(lock, std::chrono::milliseconds(queue_max_wait_ms), + [&]{ waiting_queries = waiting_queries_amount.load(); return processes.size() < max_size + waiting_queries; })) + throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, + "Too many simultaneous queries. Maximum: {}{}", + max_size, waiting_queries == 0 ? "" : fmt::format(", waiting: {}", waiting_queries)); } if (!is_unlimited_query) { QueryAmount amount = getQueryKindAmount(query_kind); - if (max_insert_queries_amount && query_kind == IAST::QueryKind::Insert && amount >= max_insert_queries_amount) + UInt64 waiting_inserts = waiting_insert_queries_amount.load(); + UInt64 waiting_selects = waiting_select_queries_amount.load(); + if (max_insert_queries_amount && query_kind == IAST::QueryKind::Insert && amount >= max_insert_queries_amount + waiting_inserts) throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, - "Too many simultaneous insert queries. Maximum: {}, current: {}", - max_insert_queries_amount, amount); - if (max_select_queries_amount && query_kind == IAST::QueryKind::Select && amount >= max_select_queries_amount) + "Too many simultaneous insert queries. Maximum: {}, current: {}{}", + max_insert_queries_amount, amount, waiting_inserts == 0 ? "" : fmt::format(", waiting: {}", waiting_inserts)); + if (max_select_queries_amount && query_kind == IAST::QueryKind::Select && amount >= max_select_queries_amount + waiting_selects) throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, - "Too many simultaneous select queries. Maximum: {}, current: {}", - max_select_queries_amount, amount); + "Too many simultaneous select queries. Maximum: {}, current: {}{}", + max_select_queries_amount, amount, waiting_selects == 0 ? 
"" : fmt::format(", waiting: {}", waiting_selects)); } { @@ -124,10 +130,12 @@ ProcessList::insert(const String & query_, const IAST * ast, ContextMutablePtr q * once is already processing 50+ concurrent queries (including analysts or any other users). */ + waiting_queries = waiting_queries_amount.load(); if (!is_unlimited_query && settings.max_concurrent_queries_for_all_users - && processes.size() >= settings.max_concurrent_queries_for_all_users) + && processes.size() >= settings.max_concurrent_queries_for_all_users + waiting_queries_amount) throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, "Too many simultaneous queries for all users. " - "Current: {}, maximum: {}", processes.size(), settings.max_concurrent_queries_for_all_users.toString()); + "Current: {}, maximum: {}{}", processes.size(), settings.max_concurrent_queries_for_all_users.toString(), + waiting_queries == 0 ? "" : fmt::format(", waiting: {}", waiting_queries)); } /** Why we use current user? @@ -145,13 +153,15 @@ ProcessList::insert(const String & query_, const IAST * ast, ContextMutablePtr q if (user_process_list != user_to_queries.end()) { + UInt64 user_waiting_queries = user_process_list->second.waiting_queries_amount.load(); if (!is_unlimited_query && settings.max_concurrent_queries_for_user - && user_process_list->second.queries.size() >= settings.max_concurrent_queries_for_user) + && user_process_list->second.queries.size() >= settings.max_concurrent_queries_for_user + user_waiting_queries) throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, "Too many simultaneous queries for user {}. " - "Current: {}, maximum: {}", + "Current: {}, maximum: {}{}", client_info.current_user, user_process_list->second.queries.size(), - settings.max_concurrent_queries_for_user.toString()); + settings.max_concurrent_queries_for_user.toString(), + user_waiting_queries == 0 ? "" : fmt::format(", waiting: {}", user_waiting_queries)); auto running_query = user_process_list->second.queries.find(client_info.current_query_id); @@ -745,4 +755,69 @@ ProcessList::QueryAmount ProcessList::getQueryKindAmount(const IAST::QueryKind & return found->second; } +void ProcessList::increaseWaitingQueryAmount(const QueryStatusPtr & status) +{ + UInt64 limit = max_waiting_queries_amount.load(); + UInt64 value = waiting_queries_amount.load(); + while (true) + { + if (value >= limit) + throw Exception(ErrorCodes::TOO_MANY_SIMULTANEOUS_QUERIES, + "Too many simultaneous waiting queries. Maximum: {}, waiting: {}", + limit, value); + if (waiting_queries_amount.compare_exchange_strong(value, value + 1)) + break; + } + + // WARNING: After this point we should not throw, otherwise corresponding `decreaseWaitingQueryAmount` will not be called. 
+
+    // Update query kind counters
+    if (status->query_kind == IAST::QueryKind::Insert)
+        waiting_insert_queries_amount.fetch_add(1);
+    if (status->query_kind == IAST::QueryKind::Select)
+        waiting_select_queries_amount.fetch_add(1);
+
+    // Update per-user counter
+    status->getUserProcessList()->waiting_queries_amount.fetch_add(1);
+
+    // We have to notify because some queries might be waiting on `have_space`
+    // and this query leaves its space by transitioning to waiting state
+    have_space.notify_all();
+}
+
+void ProcessList::decreaseWaitingQueryAmount(const QueryStatusPtr & status)
+{
+    if (status->getUserProcessList()->waiting_queries_amount.fetch_sub(1) == 0)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong waiting query amount for user: decrease to negative");
+
+    if (status->query_kind == IAST::QueryKind::Insert && waiting_insert_queries_amount.fetch_sub(1) == 0)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong insert waiting query amount: decrease to negative");
+
+    if (status->query_kind == IAST::QueryKind::Select && waiting_select_queries_amount.fetch_sub(1) == 0)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong select waiting query amount: decrease to negative");
+
+    if (waiting_queries_amount.fetch_sub(1) == 0)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong waiting query amount: decrease to negative");
+}
+
+void ProcessList::incrementWaiters()
+{
+    ContextPtr context = CurrentThread::getQueryContext();
+    QueryStatusPtr status = context->getProcessListElement();
+
+    // Query became "waiting" with the first thread that waits
+    if (status->waiting_threads.fetch_add(1) == 0)
+        increaseWaitingQueryAmount(status);
+}
+
+void ProcessList::decrementWaiters()
+{
+    ContextPtr context = CurrentThread::getQueryContext();
+    QueryStatusPtr status = context->getProcessListElement();
+
+    // Query became "non-waiting" with the last thread that no longer waits
+    if (status->waiting_threads.fetch_sub(1) == 1)
+        decreaseWaitingQueryAmount(status);
+}
+
 }
 diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h
 index 1c253f562e8..75350627698 100644
 --- a/src/Interpreters/ProcessList.h
 +++ b/src/Interpreters/ProcessList.h
@@ -42,10 +42,6 @@ class ThreadStatus;
 class ProcessListEntry;

-/** List of currently executing queries.
-  * Also implements limit on their number.
-  */
-
 /** Information of process list element.
   * To output in SHOW PROCESSLIST query. Does not contain any complex objects, that do something on copy or destructor.
   */
@@ -114,8 +110,13 @@ protected:
     /// Including EndOfStream or Exception.
     std::atomic is_all_data_sent { false };

+    /// Number of threads for the query that are waiting for load jobs
+    std::atomic waiting_threads{0};
+
+    /// For initialization of ProcessListForUser during process insertion.
     void setUserProcessList(ProcessListForUser * user_process_list_);
     /// Be careful using it. For example, queries field of ProcessListForUser could be modified concurrently.
+    ProcessListForUser * getUserProcessList() { return user_process_list; }
     const ProcessListForUser * getUserProcessList() const { return user_process_list; }

     /// Sets an entry in the ProcessList associated with this QueryStatus.
@@ -283,6 +284,9 @@ struct ProcessListForUser
     /// Count network usage for all simultaneously running queries of single user.
     ThrottlerPtr user_throttler;

+    /// Number of queries waiting on load jobs
+    std::atomic waiting_queries_amount{0};
+
     ProcessListForUserInfo getInfo(bool get_profile_events = false) const;

     /// Clears MemoryTracker for the user.
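Both `increaseWaitingQueryAmount` above and the `waiters_inc` callback in the AsyncLoader test rely on the same lock-free idiom: load the counter, check it against the limit, and retry a `compare_exchange_strong` until the increment lands. A minimal standalone sketch of that pattern follows; the `BoundedCounter` name and the error type are illustrative, not part of the patch:

```cpp
#include <atomic>
#include <cstdint>
#include <stdexcept>

/// Illustrative only: the "check limit, then CAS" pattern used by the waiter counters.
class BoundedCounter
{
public:
    explicit BoundedCounter(uint64_t limit_) : limit(limit_) {}

    void acquire()
    {
        uint64_t value = counter.load();
        while (true)
        {
            if (limit && value >= limit) /// limit == 0 means unlimited
                throw std::runtime_error("Too many waiters");
            /// On failure `value` is refreshed with the current counter,
            /// so the limit is rechecked on every retry.
            if (counter.compare_exchange_strong(value, value + 1))
                break;
        }
    }

    void release() { counter.fetch_sub(1); }

private:
    const uint64_t limit;
    std::atomic<uint64_t> counter{0};
};
```

The retry loop is what makes the limit exact under contention: two threads racing past the check cannot both win the CAS against the same expected value.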
@@ -341,6 +345,9 @@ protected: }; +/** List of currently executing queries. + * Also implements limit on their number. + */ class ProcessList : public ProcessListBase { public: @@ -399,10 +406,21 @@ protected: /// amount of queries by query kind. QueryKindAmounts query_kind_amounts; + /// limit for waiting queries. 0 means no limit. Otherwise, when limit exceeded, an exception is thrown. + std::atomic max_waiting_queries_amount{0}; + + /// amounts of waiting queries + std::atomic waiting_queries_amount{0}; + std::atomic waiting_insert_queries_amount{0}; + std::atomic waiting_select_queries_amount{0}; + void increaseQueryKindAmount(const IAST::QueryKind & query_kind); void decreaseQueryKindAmount(const IAST::QueryKind & query_kind); QueryAmount getQueryKindAmount(const IAST::QueryKind & query_kind) const; + void increaseWaitingQueryAmount(const QueryStatusPtr & status); + void decreaseWaitingQueryAmount(const QueryStatusPtr & status); + public: using EntryPtr = std::shared_ptr; @@ -458,6 +476,21 @@ public: return max_select_queries_amount; } + void setMaxWaitingQueriesAmount(UInt64 max_waiting_queries_amount_) + { + max_waiting_queries_amount.store(max_waiting_queries_amount_); + // NOTE: We cannot cancel waiting queries when limit is lowered. They have to wait anyways, but new queries will be canceled instead of waiting. + } + + size_t getMaxWaitingQueriesAmount() const + { + return max_waiting_queries_amount.load(); + } + + // Handlers for AsyncLoader waiters + void incrementWaiters(); + void decrementWaiters(); + /// Try call cancel() for input and output streams of query with specified id and user CancellationCode sendCancelToQuery(const String & current_query_id, const String & current_user, bool kill = false); CancellationCode sendCancelToQuery(QueryStatusPtr elem, bool kill = false); diff --git a/src/Storages/System/StorageSystemServerSettings.cpp b/src/Storages/System/StorageSystemServerSettings.cpp index f390985546b..bf14f757a19 100644 --- a/src/Storages/System/StorageSystemServerSettings.cpp +++ b/src/Storages/System/StorageSystemServerSettings.cpp @@ -70,6 +70,7 @@ void StorageSystemServerSettings::fillData(MutableColumns & res_columns, Context {"max_concurrent_queries", {std::to_string(context->getProcessList().getMaxSize()), ChangeableWithoutRestart::Yes}}, {"max_concurrent_insert_queries", {std::to_string(context->getProcessList().getMaxInsertQueriesAmount()), ChangeableWithoutRestart::Yes}}, {"max_concurrent_select_queries", {std::to_string(context->getProcessList().getMaxSelectQueriesAmount()), ChangeableWithoutRestart::Yes}}, + {"max_waiting_queries", {std::to_string(context->getProcessList().getMaxWaitingQueriesAmount()), ChangeableWithoutRestart::Yes}}, {"background_buffer_flush_schedule_pool_size", {std::to_string(CurrentMetrics::get(CurrentMetrics::BackgroundBufferFlushSchedulePoolSize)), ChangeableWithoutRestart::IncreaseOnly}}, {"background_schedule_pool_size", {std::to_string(CurrentMetrics::get(CurrentMetrics::BackgroundSchedulePoolSize)), ChangeableWithoutRestart::IncreaseOnly}}, From f4fc65449cc3ace36f33323600fd1a47fbfb9736 Mon Sep 17 00:00:00 2001 From: Peter Date: Fri, 8 Mar 2024 01:20:50 +0800 Subject: [PATCH 0138/1081] Add another example dataset for presenting usage --- .../example-datasets/tw-weather.md | 293 ++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 docs/en/getting-started/example-datasets/tw-weather.md diff --git a/docs/en/getting-started/example-datasets/tw-weather.md b/docs/en/getting-started/example-datasets/tw-weather.md 
new file mode 100644
index 00000000000..e5f16c403d5
--- /dev/null
+++ b/docs/en/getting-started/example-datasets/tw-weather.md
@@ -0,0 +1,293 @@
+---
+slug: /en/getting-started/example-datasets/tw-weather
+sidebar_label: Taiwan Historical Weather Datasets
+sidebar_position: 1
+description: 131 million rows of weather observation data for the last 128 years
+---
+
+# Taiwan Historical Weather Datasets
+
+This dataset contains historical meteorological observations for the last 128 years. Each row is a measurement for a weather station at a point in time.
+
+The origin of this dataset is available [here](https://github.com/Raingel/historical_weather) and the list of weather station numbers can be found [here](https://github.com/Raingel/weather_station_list).
+
+> The sources of the meteorological datasets include the meteorological stations established by the Central Weather Administration (station codes beginning with C0, C1, and 4) and the agricultural meteorological stations belonging to the Council of Agriculture (all other station codes):
+
+    - StationId
+    - MeasuredDate, the observation time
+    - StnPres, the station air pressure
+    - SeaPres, the sea level pressure
+    - Td, the dew point temperature
+    - RH, the relative humidity
+    - Other elements where available
+
+## Downloading the data
+
+- A [pre-processed version](#pre-processed-data) of the data for ClickHouse, which has been cleaned, re-structured, and enriched. This dataset covers the years from 1896 to 2023.
+- [Download the original raw data](#original-raw-data) and convert it to the format required by ClickHouse. Users who want to add their own columns may wish to explore this approach.
+
+### Pre-processed data
+
+The dataset has been re-structured from one measurement per line to one row per weather station id and measured date, i.e.
+
+```csv
+StationId,MeasuredDate,StnPres,Tx,RH,WS,WD,WSGust,WDGust,Precp,GloblRad,TxSoil0cm,TxSoil5cm,TxSoil20cm,TxSoil50cm,TxSoil100cm,SeaPres,Td,PrecpHour,SunShine,TxSoil10cm,EvapA,Visb,UVI,Cloud Amount,TxSoil30cm,TxSoil200cm,TxSoil300cm,TxSoil500cm,VaporPressure
+C0X100,2016-01-01 01:00:00,1022.1,16.1,72,1.1,8.0,,,,,,,,,,,,,,,,,,,,,,,
+C0X100,2016-01-01 02:00:00,1021.6,16.0,73,1.2,358.0,,,,,,,,,,,,,,,,,,,,,,,
+C0X100,2016-01-01 03:00:00,1021.3,15.8,74,1.5,353.0,,,,,,,,,,,,,,,,,,,,,,,
+C0X100,2016-01-01 04:00:00,1021.2,15.8,74,1.7,8.0,,,,,,,,,,,,,,,,,,,,,,,
+```
+
+This makes the table easier to query and less sparse; some elements are null because they are not available to be measured at the given weather station.
+
+This dataset is available in the following Google Cloud Storage location. Either download the dataset to your local filesystem (and insert it with the ClickHouse client) or insert it directly into ClickHouse (see [Inserting from URL](#inserting-from-url)).
+
+To download:
+
+```bash
+wget https://storage.googleapis.com/taiwan-weather-observaiton-datasets/preprocessed_weather_daily_1896_2023.tar.gz
+
+# Option: Validate the checksum
+md5sum preprocessed_weather_daily_1896_2023.tar.gz
+# Checksum should be equal to: 11b484f5bd9ddafec5cfb131eb2dd008
+
+tar -xzvf preprocessed_weather_daily_1896_2023.tar.gz
+daily_weather_preprocessed_1896_2023.csv
+
+# Option: Validate the checksum
+md5sum daily_weather_preprocessed_1896_2023.csv
+# Checksum should be equal to: 1132248c78195c43d93f843753881754
+```
+
+### Original raw data
+
+The following steps describe how to download the original raw data, which you can then transform and convert as needed.
+
+#### Download
+
+To download the original raw data:
+
+```bash
+mkdir tw_raw_weather_data && cd tw_raw_weather_data
+
+wget https://storage.googleapis.com/taiwan-weather-observaiton-datasets/raw_data_weather_daily_1896_2023.tar.gz
+
+# Option: Validate the checksum
+md5sum raw_data_weather_daily_1896_2023.tar.gz
+# Checksum should be equal to: b66b9f137217454d655e3004d7d1b51a
+
+tar -xzvf raw_data_weather_daily_1896_2023.tar.gz
+466920_1928.csv
+466920_1929.csv
+466920_1930.csv
+466920_1931.csv
+...
+
+# Option: Validate the checksum
+cat *.csv | md5sum
+# Checksum should be equal to: b26db404bf84d4063fac42e576464ce1
+```
+
+#### Retrieve the Taiwan weather stations
+
+```bash
+wget -O weather_sta_list.csv https://github.com/Raingel/weather_station_list/raw/main/data/weather_sta_list.csv
+
+# Option: Convert the UTF-8-BOM to UTF-8 encoding
+sed -i '1s/^\xEF\xBB\xBF//' weather_sta_list.csv
+```
+
+## Create table schema
+
+Create the MergeTree table in ClickHouse (from the ClickHouse client):
+
+```sql
+CREATE TABLE tw_weather_data (
+    StationId String null,
+    MeasuredDate DateTime64,
+    StnPres Float64 null,
+    SeaPres Float64 null,
+    Tx Float64 null,
+    Td Float64 null,
+    RH Float64 null,
+    WS Float64 null,
+    WD Float64 null,
+    WSGust Float64 null,
+    WDGust Float64 null,
+    Precp Float64 null,
+    PrecpHour Float64 null,
+    SunShine Float64 null,
+    GloblRad Float64 null,
+    TxSoil0cm Float64 null,
+    TxSoil5cm Float64 null,
+    TxSoil10cm Float64 null,
+    TxSoil20cm Float64 null,
+    TxSoil50cm Float64 null,
+    TxSoil100cm Float64 null,
+    TxSoil30cm Float64 null,
+    TxSoil200cm Float64 null,
+    TxSoil300cm Float64 null,
+    TxSoil500cm Float64 null,
+    VaporPressure Float64 null,
+    UVI Float64 null,
+    "Cloud Amount" Float64 null,
+    EvapA Float64 null,
+    Visb Float64 null
+)
+ENGINE = MergeTree
+ORDER BY (MeasuredDate);
+```
+
+## Inserting into ClickHouse
+
+### Inserting from local file
+
+Data can be inserted from a local file as follows (from the ClickHouse client):
+
+```sql
+INSERT INTO tw_weather_data FROM INFILE '/path/to/daily_weather_preprocessed_1896_2023.csv'
+```
+
+where `/path/to` represents the specific user path to the local file on the disk.
+
+A sample response after inserting the data into ClickHouse looks like this:
+
+```response
+Query id: 90e4b524-6e14-4855-817c-7e6f98fbeabb
+
+Ok.
+131985329 rows in set. Elapsed: 71.770 sec. Processed 131.99 million rows, 10.06 GB (1.84 million rows/s., 140.14 MB/s.)
+Peak memory usage: 583.23 MiB.
+```
+
+### Inserting from URL
+
+```sql
+INSERT INTO tw_weather_data SELECT *
+FROM url('https://storage.googleapis.com/taiwan-weather-observaiton-datasets/daily_weather_preprocessed_1896_2023.csv', 'CSVWithNames')
+```
+
+To learn how to speed this up, see our blog post on [tuning large data loads](https://clickhouse.com/blog/supercharge-your-clickhouse-data-loads-part2).
+
+## Check data rows and sizes
+
+1. Let's see how many rows were inserted:
+
+```sql
+SELECT formatReadableQuantity(count())
+FROM tw_weather_data;
+```
+
+```response
+┌─formatReadableQuantity(count())─┐
+│ 131.99 million                  │
+└─────────────────────────────────┘
+```
+
+2. Let's see how much disk space is used by this table:
+
+```sql
+SELECT
+    formatReadableSize(sum(bytes)) AS disk_size,
+    formatReadableSize(sum(data_uncompressed_bytes)) AS uncompressed_size
+FROM system.parts
+WHERE (`table` = 'tw_weather_data') AND active
+```
+
+```response
+┌─disk_size─┬─uncompressed_size─┐
+│ 2.13 GiB  │ 32.94 GiB         │
+└───────────┴───────────────────┘
+```
+
+## Sample queries
+
+### Q1: Retrieve the highest dew point temperature for each weather station in a specific year
+
+```sql
+SELECT
+    StationId,
+    max(Td) AS max_td
+FROM tw_weather_data
+WHERE (year(MeasuredDate) = 2023) AND (Td IS NOT NULL)
+GROUP BY StationId
+
+┌─StationId─┬─max_td─┐
+│ 466940    │      1 │
+│ 467300    │      1 │
+│ 467540    │      1 │
+│ 467490    │      1 │
+│ 467080    │      1 │
+│ 466910    │      1 │
+│ 467660    │      1 │
+│ 467270    │      1 │
+│ 467350    │      1 │
+│ 467571    │      1 │
+│ 466920    │      1 │
+│ 467650    │      1 │
+│ 467550    │      1 │
+│ 467480    │      1 │
+│ 467610    │      1 │
+│ 467050    │      1 │
+│ 467590    │      1 │
+│ 466990    │      1 │
+│ 467060    │      1 │
+│ 466950    │      1 │
+│ 467620    │      1 │
+│ 467990    │      1 │
+│ 466930    │      1 │
+│ 467110    │      1 │
+│ 466881    │      1 │
+│ 467410    │      1 │
+│ 467441    │      1 │
+│ 467420    │      1 │
+│ 467530    │      1 │
+│ 466900    │      1 │
+└───────────┴────────┘
+
+30 rows in set. Elapsed: 0.045 sec. Processed 6.41 million rows, 187.33 MB (143.92 million rows/s., 4.21 GB/s.)
+```
+
+### Q2: Fetch raw data for a specific time range, set of fields, and weather station
+
+```sql
+SELECT
+    StnPres,
+    SeaPres,
+    Tx,
+    Td,
+    RH,
+    WS,
+    WD,
+    WSGust,
+    WDGust,
+    Precp,
+    PrecpHour
+FROM tw_weather_data
+WHERE (StationId = 'C0UB10') AND (MeasuredDate >= '2023-12-23') AND (MeasuredDate < '2023-12-24')
+ORDER BY MeasuredDate ASC
+LIMIT 10
+```
+
+```response
+┌─StnPres─┬─SeaPres─┬───Tx─┬───Td─┬─RH─┬──WS─┬──WD─┬─WSGust─┬─WDGust─┬─Precp─┬─PrecpHour─┐
+│  1029.5 │    ᴺᵁᴸᴸ │ 11.8 │ ᴺᵁᴸᴸ │ 78 │ 2.7 │ 271 │    5.5 │    275 │ -99.8 │     -99.8 │
+│  1029.8 │    ᴺᵁᴸᴸ │ 12.3 │ ᴺᵁᴸᴸ │ 78 │ 2.7 │ 289 │    5.5 │    308 │ -99.8 │     -99.8 │
+│  1028.6 │    ᴺᵁᴸᴸ │ 12.3 │ ᴺᵁᴸᴸ │ 79 │ 2.3 │ 251 │    6.1 │    289 │ -99.8 │     -99.8 │
+│  1028.2 │    ᴺᵁᴸᴸ │   13 │ ᴺᵁᴸᴸ │ 75 │ 4.3 │ 312 │    7.5 │    316 │ -99.8 │     -99.8 │
+│  1027.8 │    ᴺᵁᴸᴸ │ 11.1 │ ᴺᵁᴸᴸ │ 89 │ 7.1 │ 310 │   11.6 │    322 │ -99.8 │     -99.8 │
+│  1027.8 │    ᴺᵁᴸᴸ │ 11.6 │ ᴺᵁᴸᴸ │ 90 │ 3.1 │ 269 │   10.7 │    295 │ -99.8 │     -99.8 │
+│  1027.9 │    ᴺᵁᴸᴸ │ 12.3 │ ᴺᵁᴸᴸ │ 89 │ 4.7 │ 296 │    8.1 │    310 │ -99.8 │     -99.8 │
+│  1028.2 │    ᴺᵁᴸᴸ │ 12.2 │ ᴺᵁᴸᴸ │ 94 │ 2.5 │ 246 │    7.1 │    283 │ -99.8 │     -99.8 │
+│  1028.4 │    ᴺᵁᴸᴸ │ 12.5 │ ᴺᵁᴸᴸ │ 94 │ 3.1 │ 265 │    4.8 │    297 │ -99.8 │     -99.8 │
+│  1028.3 │    ᴺᵁᴸᴸ │ 13.6 │ ᴺᵁᴸᴸ │ 91 │ 1.2 │ 273 │    4.4 │    256 │ -99.8 │     -99.8 │
+└─────────┴─────────┴──────┴──────┴────┴─────┴─────┴────────┴────────┴───────┴───────────┘
+
+10 rows in set. Elapsed: 0.009 sec. Processed 91.70 thousand rows, 2.33 MB (9.67 million rows/s., 245.31 MB/s.)
+``` + +## Credits + +We would like to acknowledge the efforts of the Central Weather Administration and Agricultural Meteorological Observation Network (Station) of the Council of Agriculture for preparing, cleaning, and distributing this dataset. We appreciate your efforts. + +Ou, J.-H., Kuo, C.-H., Wu, Y.-F., Lin, G.-C., Lee, M.-H., Chen, R.-K., Chou, H.-P., Wu, H.-Y., Chu, S.-C., Lai, Q.-J., Tsai, Y.-C., Lin, C.-C., Kuo, C.-C., Liao, C.-T., Chen, Y.-N., Chu, Y.-W., Chen, C.-Y., 2023. Application-oriented deep learning model for early warning of rice blast in Taiwan. Ecological Informatics 73, 101950. https://doi.org/10.1016/j.ecoinf.2022.101950 [13/12/2022] From 38cbc2c6c40541cc916bc591cd68b7eef70b1162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 7 Mar 2024 18:57:16 +0100 Subject: [PATCH 0139/1081] Restore digits --- base/base/itoa.cpp | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) diff --git a/base/base/itoa.cpp b/base/base/itoa.cpp index 9fefc9f0f07..ef844ff68a8 100644 --- a/base/base/itoa.cpp +++ b/base/base/itoa.cpp @@ -34,42 +34,37 @@ template -int digits10(T x) +inline int digits10(T x) { - if (x < T(10ULL)) + if (x < 10ULL) return 1; - if (x < T(100ULL)) + if (x < 100ULL) return 2; - if constexpr (sizeof(T) == 1) + if (x < 1000ULL) return 3; - else + + if (x < 1000000000000ULL) { - if (x < T(1000ULL)) - return 3; - - if (x < T(1000000000000ULL)) + if (x < 100000000ULL) { - if (x < T(100000000ULL)) + if (x < 1000000ULL) { - if (x < T(1000000ULL)) - { - if (x < T(10000ULL)) - return 4; - else - return 5 + (x >= T(100000ULL)); - } - - return 7 + (x >= T(10000000ULL)); + if (x < 10000ULL) + return 4; + else + return 5 + (x >= 100000ULL); } - if (x < T(10000000000ULL)) - return 9 + (x >= T(1000000000ULL)); - - return 11 + (x >= T(100000000000ULL)); + return 7 + (x >= 10000000ULL); } - return 12 + digits10(x / T(1000000000000ULL)); + if (x < 10000000000ULL) + return 9 + (x >= 1000000000ULL); + + return 11 + (x >= 100000000000ULL); } + + return 12 + digits10(x / 1000000000000ULL); } From 444595ac576438c9d0a259debf776187ddd3fcce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Thu, 7 Mar 2024 20:00:07 +0100 Subject: [PATCH 0140/1081] More speed please Mr. compiler --- base/base/itoa.cpp | 165 +++++++++++++++++++++++++++++++-------------- 1 file changed, 114 insertions(+), 51 deletions(-) diff --git a/base/base/itoa.cpp b/base/base/itoa.cpp index ef844ff68a8..08912edf3ea 100644 --- a/base/base/itoa.cpp +++ b/base/base/itoa.cpp @@ -72,7 +72,7 @@ namespace { template -static constexpr T pow10(size_t x) +ALWAYS_INLINE inline constexpr T pow10(size_t x) { return x ? 
10 * pow10(x - 1) : 1; } @@ -143,7 +143,7 @@ struct QuotientAndRemainder }; template -QuotientAndRemainder static inline split(UnsignedOfSize value) +QuotientAndRemainder inline split(UnsignedOfSize value) { constexpr DivisionBy10PowN division; @@ -154,7 +154,7 @@ QuotientAndRemainder static inline split(UnsignedOfSize value) } -static inline char * outDigit(char * p, uint8_t value) +ALWAYS_INLINE inline char * outDigit(char * p, uint8_t value) { *p = '0' + value; ++p; @@ -176,7 +176,7 @@ static const char digits[201] = "00010203040506070809" "80818283848586878889" "90919293949596979899"; -static inline char * outTwoDigits(char * p, uint8_t value) +ALWAYS_INLINE inline char * outTwoDigits(char * p, uint8_t value) { memcpy(p, &digits[value * 2], 2); p += 2; @@ -187,9 +187,9 @@ static inline char * outTwoDigits(char * p, uint8_t value) namespace convert { template -static char * head(char * p, UInt u); +char * head(char * p, UInt u); template -static char * tail(char * p, UInt u); +char * tail(char * p, UInt u); //===----------------------------------------------------------===// // head: find most significant digit, skip leading zeros @@ -198,7 +198,7 @@ static char * tail(char * p, UInt u); // "x" contains quotient and remainder after division by 10^N // quotient is less than 10^N template -static inline char * head(char * p, QuotientAndRemainder x) +ALWAYS_INLINE inline char * head(char * p, QuotientAndRemainder x) { p = head(p, UnsignedOfSize(x.quotient)); p = tail(p, x.remainder); @@ -207,14 +207,14 @@ static inline char * head(char * p, QuotientAndRemainder x) // "u" is less than 10^2*N template -static inline char * head(char * p, UInt u) +ALWAYS_INLINE inline char * head(char * p, UInt u) { return u < pow10>(N) ? head(p, UnsignedOfSize(u)) : head(p, split(u)); } // recursion base case, selected when "u" is one byte template <> -inline char * head, 1>(char * p, UnsignedOfSize<1> u) +ALWAYS_INLINE inline char * head, 1>(char * p, UnsignedOfSize<1> u) { return u < 10 ? 
outDigit(p, u) : outTwoDigits(p, u); } @@ -225,7 +225,7 @@ inline char * head, 1>(char * p, UnsignedOfSize<1> u) // recursive step, "u" is less than 10^2*N template -static inline char * tail(char * p, UInt u) +ALWAYS_INLINE inline char * tail(char * p, UInt u) { QuotientAndRemainder x = split(u); p = tail(p, UnsignedOfSize(x.quotient)); @@ -235,7 +235,7 @@ static inline char * tail(char * p, UInt u) // recursion base case, selected when "u" is one byte template <> -inline char * tail, 1>(char * p, UnsignedOfSize<1> u) +ALWAYS_INLINE inline char * tail, 1>(char * p, UnsignedOfSize<1> u) { return outTwoDigits(p, u); } @@ -244,9 +244,8 @@ inline char * tail, 1>(char * p, UnsignedOfSize<1> u) // large values are >= 10^2*N // where x contains quotient and remainder after division by 10^N //===----------------------------------------------------------===// - template -static inline char * large(char * p, QuotientAndRemainder x) +ALWAYS_INLINE inline char * large(char * p, QuotientAndRemainder x) { QuotientAndRemainder y = split(x.quotient); p = head(p, UnsignedOfSize(y.quotient)); @@ -259,9 +258,8 @@ static inline char * large(char * p, QuotientAndRemainder x) // handle values of "u" that might be >= 10^2*N // where N is the size of "u" in bytes //===----------------------------------------------------------===// - template -static inline char * uitoa(char * p, UInt u) +ALWAYS_INLINE inline char * uitoa(char * p, UInt u) { if (u < pow10>(N)) return head(p, UnsignedOfSize(u)); @@ -272,7 +270,7 @@ static inline char * uitoa(char * p, UInt u) // selected when "u" is one byte template <> -inline char * uitoa, 1>(char * p, UnsignedOfSize<1> u) +ALWAYS_INLINE inline char * uitoa, 1>(char * p, UnsignedOfSize<1> u) { if (u < 10) return outDigit(p, u); @@ -292,14 +290,14 @@ inline char * uitoa, 1>(char * p, UnsignedOfSize<1> u) // itoa: handle unsigned integral operands (selected by SFINAE) template && std::is_integral_v> * = nullptr> -static inline char * itoa(U u, char * p) +ALWAYS_INLINE inline char * itoa(U u, char * p) { return convert::uitoa(p, u); } // itoa: handle signed integral operands (selected by SFINAE) template && std::is_integral_v> * = nullptr> -static inline char * itoa(I i, char * p) +ALWAYS_INLINE inline char * itoa(I i, char * p) { // Need "mask" to be filled with a copy of the sign bit. 
// If "i" is a negative value, then the result of "operator >>" @@ -335,63 +333,128 @@ static inline char * itoa(I i, char * p) } -template -static NO_INLINE char * writeUIntText(T _x, char * p) +const uint64_t max_multiple_of_hundred_that_fits_in_64_bits = 1'00'00'00'00'00'00'00'00'00ull; +constexpr int max_multiple_of_hundred_blocks = 9; +static_assert(max_multiple_of_hundred_that_fits_in_64_bits % 100 == 0); + +ALWAYS_INLINE inline char * writeUIntText(UInt128 _x, char * p) { - static_assert(std::is_same_v || std::is_same_v); - using T_ = std::conditional_t< - std::is_same_v, - unsigned __int128, + /// If we the highest 8 byte item is empty, we can print only the lowest item as i64 + if (_x.items[UInt128::_impl::little(1)] == 0) + return convert::itoa(_x.items[UInt128::_impl::little(0)], p); + + /// Doing operations using __int128 is faster, as we already rely on this feature + using T = unsigned __int128; + T x = (T(_x.items[UInt128::_impl::little(1)]) << 64) + T(_x.items[UInt128::_impl::little(0)]); + + /// We are going to accumulate blocks of 2 digits to print until the number is small enough to be printed as u64 + /// To do this we could do: x / 100, x % 100 + /// But this is too many iterations with long integers, so instead we can divide by a much longer integer + /// max_multiple_of_hundred_that_fits_in_64_bits and then get the blocks out of this (as u64) + static const T large_divisor = max_multiple_of_hundred_that_fits_in_64_bits; + static const T largest_uint64 = std::numeric_limits::max(); + uint8_t two_values[20] = {0}; // 39 Max characters / 2 + + int current_block = 0; + while (x > largest_uint64) + { + uint64_t remainder = uint64_t(x % large_divisor); + x /= large_divisor; + + int pos = current_block; + while (remainder) + { + two_values[pos] = uint8_t(remainder % 100); + pos++; + remainder /= 100; + } + current_block += max_multiple_of_hundred_blocks; + } + + char * highest_part_print = convert::itoa(uint64_t(x), p); + for (int i = 0; i < current_block; i++) + { + outTwoDigits(highest_part_print, two_values[current_block - 1 - i]); + highest_part_print += 2; + } + + return highest_part_print; +} + +ALWAYS_INLINE inline char * writeUIntText(UInt256 _x, char * p) +{ + /// If possible, treat it as a smaller integer as they are much faster to print + if (_x.items[UInt256::_impl::little(3)] == 0 && _x.items[UInt256::_impl::little(2)] == 0) + return writeUIntText(UInt128{_x.items[UInt256::_impl::little(0)], _x.items[UInt256::_impl::little(1)]}, p); + + /// If available (x86) we transform from our custom class to _BitInt(256) which has better support in the compiler + /// and produces better code + using T = #if defined(__x86_64__) # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wbit-int-extension" unsigned _BitInt(256) # pragma clang diagnostic pop #else - T + UInt256 #endif - >; + ; - T_ x; - T_ hundred(100ULL); - if constexpr (std::is_same_v) - { - x = (T_(_x.items[T::_impl::little(1)]) << 64) + T_(_x.items[T::_impl::little(0)]); - } - else - { #if defined(__x86_64__) - x = (T_(_x.items[T::_impl::little(3)]) << 192) + (T_(_x.items[T::_impl::little(2)]) << 128) - + (T_(_x.items[T::_impl::little(1)]) << 64) + T_(_x.items[T::_impl::little(0)]); + T x = (T(_x.items[UInt256::_impl::little(3)]) << 192) + (T(_x.items[UInt256::_impl::little(2)]) << 128) + + (T(_x.items[UInt256::_impl::little(1)]) << 64) + T(_x.items[UInt256::_impl::little(0)]); #else - x = _x; + T x = _x; #endif + + /// Similar to writeUIntText(UInt128) only that in this case we will stop as soon 
as we reach the largest u128 + /// and switch to that function + uint8_t two_values[39] = {0}; // 78 Max characters / 2 + int current_pos = 0; + + static const T large_divisor = max_multiple_of_hundred_that_fits_in_64_bits; + static const T largest_uint128 = T(std::numeric_limits::max()) << 64 | T(std::numeric_limits::max()); + + while (x > largest_uint128) + { + uint64_t remainder = uint64_t(x % large_divisor); + x /= large_divisor; + + int pos = current_pos; + while (remainder) + { + two_values[pos] = uint8_t(remainder % 100); + pos++; + remainder /= 100; + } + current_pos += max_multiple_of_hundred_blocks; } - int len = digits10(x); - auto * pp = p + len; - while (x >= hundred) +#if defined(__x86_64__) + UInt128 pending{uint64_t(x), uint64_t(x >> 64)}; +#else + UInt128 pending{x.items[UInt256::_impl::little(0)], x.items[UInt256::_impl::little(1)]}; +#endif + + char * highest_part_print = writeUIntText(pending, p); + for (int i = 0; i < current_pos; i++) { - const auto i = x % hundred; - x /= hundred; - pp -= 2; - outTwoDigits(pp, i); + outTwoDigits(highest_part_print, two_values[current_pos - 1 - i]); + highest_part_print += 2; } - if (x < 10) - *p = '0' + x; - else - outTwoDigits(p, x); - return p + len; + + return highest_part_print; } -static ALWAYS_INLINE inline char * writeLeadingMinus(char * pos) + +ALWAYS_INLINE inline char * writeLeadingMinus(char * pos) { *pos = '-'; return pos + 1; } template -static ALWAYS_INLINE inline char * writeSIntText(T x, char * pos) +ALWAYS_INLINE inline char * writeSIntText(T x, char * pos) { static_assert(std::is_same_v || std::is_same_v); From b0b38121a60682ee79d47914e18dd57d892f394e Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Thu, 7 Mar 2024 20:44:03 +0000 Subject: [PATCH 0141/1081] Crash repro --- src/Storages/StorageMergeTree.cpp | 1 + ...eplicas_cte_explain_syntax_crash.reference | 15 +++++++ ...llel_replicas_cte_explain_syntax_crash.sql | 44 +++++++++++++++++++ 3 files changed, 60 insertions(+) create mode 100644 tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.reference create mode 100644 tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.sql diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index e15b308f084..dbc901d4a5f 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -220,6 +220,7 @@ void StorageMergeTree::read( Block header; if (local_context->getSettingsRef().allow_experimental_analyzer) { + chassert(query_info.query_tree); QueryTreeNodePtr modified_query_tree = query_info.query_tree->clone(); rewriteJoinToGlobalJoin(modified_query_tree, local_context); modified_query_tree = buildQueryTreeForShard(query_info.planner_context, modified_query_tree); diff --git a/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.reference b/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.reference new file mode 100644 index 00000000000..078890722fd --- /dev/null +++ b/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.reference @@ -0,0 +1,15 @@ +WITH + cte1 AS + ( + SELECT n + FROM numbers_1e6__fuzz_34 + ), + cte2 AS + ( + SELECT n + FROM numbers_1e6__fuzz_33 + PREWHERE n IN cte1 + ) +SELECT count() +FROM +cte2 diff --git a/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.sql b/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.sql new file mode 100644 index 00000000000..a407fceb1c6 --- /dev/null +++ 
b/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.sql @@ -0,0 +1,44 @@ +DROP TABLE IF EXISTS numbers_1e6__fuzz_34; +DROP TABLE IF EXISTS numbers_1e6__fuzz_33; + +CREATE TABLE numbers_1e6__fuzz_34 +( + `n` LowCardinality(Nullable(UInt8)) +) +ENGINE = MergeTree +ORDER BY n +SETTINGS allow_nullable_key = 1 +AS SELECT * +FROM numbers(1000000) +SETTINGS allow_suspicious_low_cardinality_types = 1; + + +CREATE TABLE numbers_1e6__fuzz_33 +( + `n` LowCardinality(Nullable(UInt8)) +) +ENGINE = MergeTree +ORDER BY n +SETTINGS allow_nullable_key = 1 +AS SELECT * +FROM numbers(1000000) +SETTINGS allow_suspicious_low_cardinality_types = 1; + +SET allow_experimental_analyzer = 0; +SET allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; +EXPLAIN SYNTAX +WITH + cte1 AS + ( + SELECT n + FROM numbers_1e6__fuzz_34 + ), + cte2 AS + ( + SELECT n + FROM numbers_1e6__fuzz_33 + PREWHERE n IN (cte1) + ) +SELECT count() +FROM cte2; +-- SETTINGS allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; From cde811804f68656577dd1497e511a4a9295d4544 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 8 Mar 2024 12:20:34 +0800 Subject: [PATCH 0142/1081] prevent memcpySmallAllowReadWriteOverflow15Impl optimized to memcpy --- src/Common/memcpySmall.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Common/memcpySmall.h b/src/Common/memcpySmall.h index f3d26c60380..f5e9f31fc46 100644 --- a/src/Common/memcpySmall.h +++ b/src/Common/memcpySmall.h @@ -49,6 +49,9 @@ namespace detail dst += 16; src += 16; n -= 16; + + /// Avoid clang loop-idion optimization, which transforms _mm_storeu_si128 to built-in memcpy + __asm__ __volatile__("" : : : "memory"); } } } From 89ae39e598a481dbb2c610ee7dca1fc7272517b7 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Fri, 8 Mar 2024 12:22:07 +0800 Subject: [PATCH 0143/1081] optimize column string replicate --- src/Columns/ColumnString.cpp | 38 ++++++++++++++++++++---------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/Columns/ColumnString.cpp b/src/Columns/ColumnString.cpp index b9128372cea..0c52a7be086 100644 --- a/src/Columns/ColumnString.cpp +++ b/src/Columns/ColumnString.cpp @@ -460,6 +460,7 @@ void ColumnString::updatePermutationWithCollation(const Collator & collator, Per DefaultPartialSort()); } + ColumnPtr ColumnString::replicate(const Offsets & replicate_offsets) const { size_t col_size = size(); @@ -471,32 +472,35 @@ ColumnPtr ColumnString::replicate(const Offsets & replicate_offsets) const if (0 == col_size) return res; - Chars & res_chars = res->chars; Offsets & res_offsets = res->offsets; - res_chars.reserve_exact(chars.size() / col_size * replicate_offsets.back()); - res_offsets.reserve_exact(replicate_offsets.back()); - - Offset prev_replicate_offset = 0; - Offset prev_string_offset = 0; - Offset current_new_offset = 0; + res_offsets.resize_exact(replicate_offsets.back()); + Chars & res_chars = res->chars; + size_t res_chars_size = 0; for (size_t i = 0; i < col_size; ++i) { - size_t size_to_replicate = replicate_offsets[i] - prev_replicate_offset; - size_t string_size = offsets[i] - prev_string_offset; + size_t size_to_replicate = replicate_offsets[i] - 
replicate_offsets[i - 1]; + size_t string_size = offsets[i] - offsets[i - 1]; + res_chars_size += size_to_replicate * string_size; + } + res_chars.resize_exact(res_chars_size); + size_t curr_row = 0; + size_t curr_offset = 0; + for (size_t i = 0; i < col_size; ++i) + { + const size_t size_to_replicate = replicate_offsets[i] - replicate_offsets[i - 1]; + const size_t string_size = offsets[i] - offsets[i-1]; + const UInt8 * src = &chars[offsets[i - 1]]; for (size_t j = 0; j < size_to_replicate; ++j) { - current_new_offset += string_size; - res_offsets.push_back(current_new_offset); - - res_chars.resize(res_chars.size() + string_size); memcpySmallAllowReadWriteOverflow15( - &res_chars[res_chars.size() - string_size], &chars[prev_string_offset], string_size); - } + &res_chars[curr_offset], src, string_size); - prev_replicate_offset = replicate_offsets[i]; - prev_string_offset = offsets[i]; + curr_offset += string_size; + res_offsets[curr_row] = curr_offset; + ++curr_row; + } } return res; From 391af00b64e62dc7d9e1c5726c1aa202f19dffd8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Fri, 8 Mar 2024 13:47:35 +0800 Subject: [PATCH 0144/1081] Update memcpySmall.h --- src/Common/memcpySmall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/memcpySmall.h b/src/Common/memcpySmall.h index f5e9f31fc46..90648254d76 100644 --- a/src/Common/memcpySmall.h +++ b/src/Common/memcpySmall.h @@ -50,7 +50,7 @@ namespace detail src += 16; n -= 16; - /// Avoid clang loop-idion optimization, which transforms _mm_storeu_si128 to built-in memcpy + /// Avoid clang loop-idiom optimization, which transforms _mm_storeu_si128 to built-in memcpy __asm__ __volatile__("" : : : "memory"); } } From d52027c5a09f0f9619bc2f5df639f1a042b2c084 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 8 Mar 2024 12:00:34 +0100 Subject: [PATCH 0145/1081] Style and comments --- base/base/itoa.cpp | 47 +++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/base/base/itoa.cpp b/base/base/itoa.cpp index 08912edf3ea..4475ae416b9 100644 --- a/base/base/itoa.cpp +++ b/base/base/itoa.cpp @@ -165,16 +165,16 @@ ALWAYS_INLINE inline char * outDigit(char * p, uint8_t value) // into ascii characters as described by Andrei Alexandrescu in // https://www.facebook.com/notes/facebook-engineering/three-optimization-tips-for-c/10151361643253920/ -static const char digits[201] = "00010203040506070809" - "10111213141516171819" - "20212223242526272829" - "30313233343536373839" - "40414243444546474849" - "50515253545556575859" - "60616263646566676869" - "70717273747576777879" - "80818283848586878889" - "90919293949596979899"; +const char digits[201] = "00010203040506070809" + "10111213141516171819" + "20212223242526272829" + "30313233343536373839" + "40414243444546474849" + "50515253545556575859" + "60616263646566676869" + "70717273747576777879" + "80818283848586878889" + "90919293949596979899"; ALWAYS_INLINE inline char * outTwoDigits(char * p, uint8_t value) { @@ -334,23 +334,24 @@ ALWAYS_INLINE inline char * itoa(I i, char * p) const uint64_t max_multiple_of_hundred_that_fits_in_64_bits = 1'00'00'00'00'00'00'00'00'00ull; -constexpr int max_multiple_of_hundred_blocks = 9; +const int max_multiple_of_hundred_blocks = 9; static_assert(max_multiple_of_hundred_that_fits_in_64_bits % 100 == 0); ALWAYS_INLINE inline char * writeUIntText(UInt128 _x, char * p) { - /// If we the highest 8 byte item is empty, we can print only 
the lowest item as i64 + /// If we the highest 64bit item is empty, we can print just the lowest item as u64 if (_x.items[UInt128::_impl::little(1)] == 0) return convert::itoa(_x.items[UInt128::_impl::little(0)], p); - /// Doing operations using __int128 is faster, as we already rely on this feature + /// Doing operations using __int128 is faster and we already rely on this feature using T = unsigned __int128; T x = (T(_x.items[UInt128::_impl::little(1)]) << 64) + T(_x.items[UInt128::_impl::little(0)]); /// We are going to accumulate blocks of 2 digits to print until the number is small enough to be printed as u64 /// To do this we could do: x / 100, x % 100 - /// But this is too many iterations with long integers, so instead we can divide by a much longer integer - /// max_multiple_of_hundred_that_fits_in_64_bits and then get the blocks out of this (as u64) + /// But these would mean doing many iterations with long integers, so instead we divide by a much longer integer + /// multiple of 100 (100^9) and then get the blocks out of it (as u64) + /// Once we reach u64::max we can stop and use the fast method to print that in the front static const T large_divisor = max_multiple_of_hundred_that_fits_in_64_bits; static const T largest_uint64 = std::numeric_limits::max(); uint8_t two_values[20] = {0}; // 39 Max characters / 2 @@ -358,15 +359,15 @@ ALWAYS_INLINE inline char * writeUIntText(UInt128 _x, char * p) int current_block = 0; while (x > largest_uint64) { - uint64_t remainder = uint64_t(x % large_divisor); + uint64_t u64_remainder = uint64_t(x % large_divisor); x /= large_divisor; int pos = current_block; - while (remainder) + while (u64_remainder) { - two_values[pos] = uint8_t(remainder % 100); + two_values[pos] = uint8_t(u64_remainder % 100); pos++; - remainder /= 100; + u64_remainder /= 100; } current_block += max_multiple_of_hundred_blocks; } @@ -417,15 +418,15 @@ ALWAYS_INLINE inline char * writeUIntText(UInt256 _x, char * p) while (x > largest_uint128) { - uint64_t remainder = uint64_t(x % large_divisor); + uint64_t u64_remainder = uint64_t(x % large_divisor); x /= large_divisor; int pos = current_pos; - while (remainder) + while (u64_remainder) { - two_values[pos] = uint8_t(remainder % 100); + two_values[pos] = uint8_t(u64_remainder % 100); pos++; - remainder /= 100; + u64_remainder /= 100; } current_pos += max_multiple_of_hundred_blocks; } From 2f3c103367de4fab57602e3fc1821608df718c77 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 8 Mar 2024 12:02:53 +0100 Subject: [PATCH 0146/1081] OSX quirks --- base/base/itoa.cpp | 4 ++++ base/base/itoa.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/base/base/itoa.cpp b/base/base/itoa.cpp index 4475ae416b9..d877f15b563 100644 --- a/base/base/itoa.cpp +++ b/base/base/itoa.cpp @@ -542,6 +542,10 @@ char * itoa(Int256 i, char * p) #define INSTANTIATION(T) template char * itoa(T i, char * p); FOR_MISSING_INTEGER_TYPES(INSTANTIATION) +#if defined(OS_DARWIN) +INSTANTIATION(size_t) +#endif + #undef FOR_MISSING_INTEGER_TYPES #undef INSTANTIATION diff --git a/base/base/itoa.h b/base/base/itoa.h index 71603cdeb88..98a570b12fa 100644 --- a/base/base/itoa.h +++ b/base/base/itoa.h @@ -25,6 +25,10 @@ template <> char * itoa(Int256 i, char * p); extern template char * itoa(T i, char * p); FOR_MISSING_INTEGER_TYPES(INSTANTIATION) +#if defined(OS_DARWIN) +INSTANTIATION(size_t) +#endif + #undef FOR_MISSING_INTEGER_TYPES #undef INSTANTIATION From e7cc49212a15ca8bcf87950225e066d04c8823e4 Mon Sep 17 00:00:00 2001 From: Yarik 
Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 8 Mar 2024 12:28:14 +0100 Subject: [PATCH 0147/1081] fix tidy build --- src/Functions/array/arrayDistance.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index 03f0bc7b286..0045075ddef 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -523,18 +523,18 @@ private: const auto & offsets_y = array_y.getOffsets(); ColumnArray::Offset prev_offset = 0; - for (size_t row = 0; row < offsets_y.size(); ++row) + for (auto offset_y : offsets_y) { - if (offsets_x[0] != offsets_y[row] - prev_offset) [[unlikely]] + if (offsets_x[0] != offset_y - prev_offset) [[unlikely]] { throw Exception( ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, "Arguments of function {} have different array sizes: {} and {}", getName(), offsets_x[0], - offsets_y[row] - prev_offset); + offset_y - prev_offset); } - prev_offset = offsets_y[row]; + prev_offset = offset_y; } const typename Kernel::ConstParams kernel_params = initConstParams(arguments); From 07ce390609238318a7ab115b3d7020f03150ce76 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 8 Mar 2024 13:59:24 +0100 Subject: [PATCH 0148/1081] No public templates, just happiness --- base/base/itoa.cpp | 44 +++++++++++++++++++++----------------------- base/base/itoa.h | 35 ++++++++++++++--------------------- 2 files changed, 35 insertions(+), 44 deletions(-) diff --git a/base/base/itoa.cpp b/base/base/itoa.cpp index d877f15b563..3c4f0bb048d 100644 --- a/base/base/itoa.cpp +++ b/base/base/itoa.cpp @@ -32,9 +32,11 @@ #include #include +namespace +{ template -inline int digits10(T x) +int digits10T(T x) { if (x < 10ULL) return 1; @@ -64,13 +66,9 @@ inline int digits10(T x) return 11 + (x >= 100000000000ULL); } - return 12 + digits10(x / 1000000000000ULL); + return 12 + digits10T(x / 1000000000000ULL); } - -namespace -{ - template ALWAYS_INLINE inline constexpr T pow10(size_t x) { @@ -487,67 +485,62 @@ ALWAYS_INLINE inline char * writeSIntText(T x, char * pos) } } -template -char * itoa(T i, char * p) -{ - return convert::itoa(i, p); -} -template <> char * itoa(UInt8 i, char * p) { return convert::itoa(uint8_t(i), p); } -template <> char * itoa(Int8 i, char * p) { return convert::itoa(int8_t(i), p); } -template <> char * itoa(UInt128 i, char * p) { return writeUIntText(i, p); } -template <> char * itoa(Int128 i, char * p) { return writeSIntText(i, p); } -template <> char * itoa(UInt256 i, char * p) { return writeUIntText(i, p); } -template <> char * itoa(Int256 i, char * p) { return writeSIntText(i, p); } +#define DEFAULT_ITOA(T) \ + char * itoa(T i, char * p) \ + { \ + return convert::itoa(i, p); \ + } + #define FOR_MISSING_INTEGER_TYPES(M) \ - M(int8_t) \ M(uint8_t) \ M(UInt16) \ M(UInt32) \ M(UInt64) \ + M(int8_t) \ M(Int16) \ M(Int32) \ M(Int64) -#define INSTANTIATION(T) template char * itoa(T i, char * p); -FOR_MISSING_INTEGER_TYPES(INSTANTIATION) +FOR_MISSING_INTEGER_TYPES(DEFAULT_ITOA) #if defined(OS_DARWIN) -INSTANTIATION(size_t) +DEFAULT_ITOA(unsigned long) +DEFAULT_ITOA(long) #endif #undef FOR_MISSING_INTEGER_TYPES -#undef INSTANTIATION +#undef DEFAULT_ITOA #define DIGITS_INTEGER_TYPES(M) \ @@ -559,7 +552,12 @@ INSTANTIATION(size_t) M(UInt128) \ M(UInt256) -#define INSTANTIATION(T) template int digits10(T x); +#define INSTANTIATION(T) \ + int digits10(T x) \ + { \ + return digits10T(x); \ + } + DIGITS_INTEGER_TYPES(INSTANTIATION) 
#undef DIGITS_INTEGER_TYPES diff --git a/base/base/itoa.h b/base/base/itoa.h index 98a570b12fa..9a89fa739dd 100644 --- a/base/base/itoa.h +++ b/base/base/itoa.h @@ -2,39 +2,33 @@ #include -template char * itoa(T i, char * p); - -template <> char * itoa(UInt8 i, char * p); -template <> char * itoa(Int8 i, char * p); -template <> char * itoa(UInt128 i, char * p); -template <> char * itoa(Int128 i, char * p); -template <> char * itoa(UInt256 i, char * p); -template <> char * itoa(Int256 i, char * p); - -#define FOR_MISSING_INTEGER_TYPES(M) \ - M(int8_t) \ +#define FOR_INTEGER_TYPES(M) \ M(uint8_t) \ + M(UInt8) \ M(UInt16) \ M(UInt32) \ M(UInt64) \ + M(UInt128) \ + M(UInt256) \ + M(int8_t) \ + M(Int8) \ M(Int16) \ M(Int32) \ - M(Int64) + M(Int64) \ + M(Int128) \ + M(Int256) -#define INSTANTIATION(T) \ - extern template char * itoa(T i, char * p); -FOR_MISSING_INTEGER_TYPES(INSTANTIATION) +#define INSTANTIATION(T) char * itoa(T i, char * p); +FOR_INTEGER_TYPES(INSTANTIATION) #if defined(OS_DARWIN) -INSTANTIATION(size_t) +INSTANTIATION(unsigned long) +INSTANTIATION(long) #endif #undef FOR_MISSING_INTEGER_TYPES #undef INSTANTIATION - -template int digits10(T x); - #define DIGITS_INTEGER_TYPES(M) \ M(uint8_t) \ M(UInt8) \ @@ -43,8 +37,7 @@ template int digits10(T x); M(UInt64) \ M(UInt128) \ M(UInt256) -#define INSTANTIATION(T) \ - extern template int digits10(T x); +#define INSTANTIATION(T) int digits10(T x); DIGITS_INTEGER_TYPES(INSTANTIATION) #undef DIGITS_INTEGER_TYPES #undef INSTANTIATION From 4b964979c2c831a2ddeee5a7b0c10066f520ab11 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 8 Mar 2024 14:22:33 +0100 Subject: [PATCH 0149/1081] Digits doesn't belong with itoa anymore --- base/base/itoa.cpp | 55 ----------------------------------- base/base/itoa.h | 13 --------- src/Functions/countDigits.cpp | 34 ++++++++++++++++++++++ 3 files changed, 34 insertions(+), 68 deletions(-) diff --git a/base/base/itoa.cpp b/base/base/itoa.cpp index 3c4f0bb048d..608258c6b56 100644 --- a/base/base/itoa.cpp +++ b/base/base/itoa.cpp @@ -35,40 +35,6 @@ namespace { -template -int digits10T(T x) -{ - if (x < 10ULL) - return 1; - if (x < 100ULL) - return 2; - if (x < 1000ULL) - return 3; - - if (x < 1000000000000ULL) - { - if (x < 100000000ULL) - { - if (x < 1000000ULL) - { - if (x < 10000ULL) - return 4; - else - return 5 + (x >= 100000ULL); - } - - return 7 + (x >= 10000000ULL); - } - - if (x < 10000000000ULL) - return 9 + (x >= 1000000000ULL); - - return 11 + (x >= 100000000000ULL); - } - - return 12 + digits10T(x / 1000000000000ULL); -} - template ALWAYS_INLINE inline constexpr T pow10(size_t x) { @@ -541,24 +507,3 @@ DEFAULT_ITOA(long) #undef FOR_MISSING_INTEGER_TYPES #undef DEFAULT_ITOA - - -#define DIGITS_INTEGER_TYPES(M) \ - M(uint8_t) \ - M(UInt8) \ - M(UInt16) \ - M(UInt32) \ - M(UInt64) \ - M(UInt128) \ - M(UInt256) - -#define INSTANTIATION(T) \ - int digits10(T x) \ - { \ - return digits10T(x); \ - } - -DIGITS_INTEGER_TYPES(INSTANTIATION) - -#undef DIGITS_INTEGER_TYPES -#undef INSTANTIATION diff --git a/base/base/itoa.h b/base/base/itoa.h index 9a89fa739dd..e69ce0ef17d 100644 --- a/base/base/itoa.h +++ b/base/base/itoa.h @@ -28,16 +28,3 @@ INSTANTIATION(long) #undef FOR_MISSING_INTEGER_TYPES #undef INSTANTIATION - -#define DIGITS_INTEGER_TYPES(M) \ - M(uint8_t) \ - M(UInt8) \ - M(UInt16) \ - M(UInt32) \ - M(UInt64) \ - M(UInt128) \ - M(UInt256) -#define INSTANTIATION(T) int digits10(T x); -DIGITS_INTEGER_TYPES(INSTANTIATION) -#undef DIGITS_INTEGER_TYPES -#undef INSTANTIATION 
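The declarations in `itoa.h` and the definitions in `itoa.cpp` are kept in sync through the X-macro idiom: a single type list (`FOR_INTEGER_TYPES`, `DIGITS_INTEGER_TYPES`) is expanded with a different `INSTANTIATION` macro on each side, so the two files cannot drift apart. A minimal standalone sketch of the idiom follows; the names and the `print_size` function are illustrative, not from the patch:

```cpp
#include <cstdio>

/// Illustrative only: one list of types drives both declarations and definitions.
#define FOR_DEMO_TYPES(M) \
    M(int) \
    M(long) \
    M(unsigned)

/// "Header" side: declare one overload per listed type.
#define DECLARE_PRINT(T) void print_size(T);
FOR_DEMO_TYPES(DECLARE_PRINT)
#undef DECLARE_PRINT

/// "Source" side: define the matching set of overloads.
#define DEFINE_PRINT(T) \
    void print_size(T) { std::printf("sizeof = %zu\n", sizeof(T)); }
FOR_DEMO_TYPES(DEFINE_PRINT)
#undef DEFINE_PRINT

int main()
{
    print_size(1);   /// selects the int overload
    print_size(1L);  /// selects the long overload
    print_size(1u);  /// selects the unsigned overload
    return 0;
}
```

Adding a type to the list (as the OSX `unsigned long`/`long` quirk above does) automatically produces both the declaration and the definition.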
diff --git a/src/Functions/countDigits.cpp b/src/Functions/countDigits.cpp index 2ca8d944b0a..f2712b5b301 100644 --- a/src/Functions/countDigits.cpp +++ b/src/Functions/countDigits.cpp @@ -20,6 +20,40 @@ namespace ErrorCodes namespace { +template +int digits10(T x) +{ + if (x < 10ULL) + return 1; + if (x < 100ULL) + return 2; + if (x < 1000ULL) + return 3; + + if (x < 1000000000000ULL) + { + if (x < 100000000ULL) + { + if (x < 1000000ULL) + { + if (x < 10000ULL) + return 4; + else + return 5 + (x >= 100000ULL); + } + + return 7 + (x >= 10000000ULL); + } + + if (x < 10000000000ULL) + return 9 + (x >= 1000000000ULL); + + return 11 + (x >= 100000000000ULL); + } + + return 12 + digits10(x / 1000000000000ULL); +} + /// Returns number of decimal digits you need to represent the value. /// For Decimal values takes in account their scales: calculates result over underlying int type which is (value * scale). /// countDigits(42) = 2, countDigits(42.000) = 5, countDigits(0.04200) = 4. From 4f27dd8f9c887cec6273dc7960b165ee2537ad26 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Fri, 8 Mar 2024 14:27:50 +0100 Subject: [PATCH 0150/1081] upd test --- .../0_stateless/02864_restore_table_with_broken_part.reference | 2 +- .../queries/0_stateless/02864_restore_table_with_broken_part.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/02864_restore_table_with_broken_part.reference b/tests/queries/0_stateless/02864_restore_table_with_broken_part.reference index 9a8dcda81df..9247a7d6ab6 100644 --- a/tests/queries/0_stateless/02864_restore_table_with_broken_part.reference +++ b/tests/queries/0_stateless/02864_restore_table_with_broken_part.reference @@ -1,4 +1,4 @@ -data.bin doesn't exist: while restoring part all_2_2_0 +OK RESTORED 1 3 diff --git a/tests/queries/0_stateless/02864_restore_table_with_broken_part.sh b/tests/queries/0_stateless/02864_restore_table_with_broken_part.sh index cf99c7e9284..d3252b29eb7 100755 --- a/tests/queries/0_stateless/02864_restore_table_with_broken_part.sh +++ b/tests/queries/0_stateless/02864_restore_table_with_broken_part.sh @@ -26,7 +26,7 @@ ln -s "$SRC_BACKUP_DIR/$SRC_BACKUP_FILENAME" "$BACKUPS_DIR/$BACKUP_FILENAME" $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS tbl" # First try to restore with the setting `restore_broken_parts_as_detached` set to false. 
-$CLICKHOUSE_CLIENT --query "RESTORE TABLE default.tbl AS tbl FROM $BACKUP_NAME" 2>&1 | grep -o -m 1 "data.bin doesn't exist: while restoring part all_2_2_0" +$CLICKHOUSE_CLIENT --query "RESTORE TABLE default.tbl AS tbl FROM $BACKUP_NAME" 2>&1 | tr -d \\n | grep "data.bin doesn't exist" | grep "while restoring part all_2_2_0" > /dev/null && echo "OK" || echo "FAILED" $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS tbl" From 45a1f0f8ed33fb09fee92137a391dfd25ba63f05 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 8 Mar 2024 14:02:34 +0000 Subject: [PATCH 0151/1081] More consistency and extended tests --- src/Functions/array/arrayDistance.cpp | 8 ++-- src/Functions/array/arrayDotProduct.cpp | 43 ++++++++----------- .../0_stateless/02708_dotProduct.reference | 12 ++++++ .../queries/0_stateless/02708_dotProduct.sql | 16 +++++-- 4 files changed, 47 insertions(+), 32 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index 0045075ddef..8b591e37ff6 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -471,10 +471,9 @@ private: const typename Kernel::ConstParams kernel_params = initConstParams(arguments); - auto result = ColumnVector::create(input_rows_count); - auto & result_data = result->getData(); + auto col_res = ColumnVector::create(input_rows_count); + auto & result_data = col_res->getData(); - /// Do the actual computation ColumnArray::Offset prev = 0; size_t row = 0; @@ -503,7 +502,7 @@ private: result_data[row] = Kernel::finalize(state, kernel_params); row++; } - return result; + return col_res; } /// Special case when the 1st parameter is Const @@ -542,7 +541,6 @@ private: auto result = ColumnVector::create(input_rows_count); auto & result_data = result->getData(); - /// Do the actual computation size_t prev = 0; size_t row = 0; diff --git a/src/Functions/array/arrayDotProduct.cpp b/src/Functions/array/arrayDotProduct.cpp index 8b7c85e05dd..97dc9653bab 100644 --- a/src/Functions/array/arrayDotProduct.cpp +++ b/src/Functions/array/arrayDotProduct.cpp @@ -140,6 +140,7 @@ public: static FunctionPtr create(ContextPtr) { return std::make_shared(); } size_t getNumberOfArguments() const override { return 2; } bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + bool useDefaultImplementationForConstants() const override { return true; } DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { @@ -174,13 +175,13 @@ public: ACTION(Float32) \ ACTION(Float64) - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /* input_rows_count */) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { switch (result_type->getTypeId()) { #define ON_TYPE(type) \ case TypeIndex::type: \ - return executeWithResultType(arguments); \ + return executeWithResultType(arguments, input_rows_count); \ break; SUPPORTED_TYPES(ON_TYPE) @@ -193,7 +194,7 @@ public: private: template - ColumnPtr executeWithResultType(const ColumnsWithTypeAndName & arguments) const + ColumnPtr executeWithResultType(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const { DataTypePtr type_x = typeid_cast(arguments[0].type.get())->getNestedType(); @@ -201,7 +202,7 @@ private: { #define ON_TYPE(type) \ case TypeIndex::type: \ - return executeWithResultTypeAndLeftType(arguments); \ + 
return executeWithResultTypeAndLeftType(arguments, input_rows_count); \ break; SUPPORTED_TYPES(ON_TYPE) @@ -218,7 +219,7 @@ private: } template - ColumnPtr executeWithResultTypeAndLeftType(const ColumnsWithTypeAndName & arguments) const + ColumnPtr executeWithResultTypeAndLeftType(const ColumnsWithTypeAndName & arguments, size_t input_rows_count) const { DataTypePtr type_y = typeid_cast(arguments[1].type.get())->getNestedType(); @@ -226,7 +227,7 @@ private: { #define ON_TYPE(type) \ case TypeIndex::type: \ - return executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column); \ + return executeWithResultTypeAndLeftTypeAndRightType(arguments[0].column, arguments[1].column, input_rows_count); \ break; SUPPORTED_TYPES(ON_TYPE) @@ -243,15 +244,15 @@ private: } template - ColumnPtr executeWithResultTypeAndLeftTypeAndRightType(ColumnPtr col_x, ColumnPtr col_y) const + ColumnPtr executeWithResultTypeAndLeftTypeAndRightType(ColumnPtr col_x, ColumnPtr col_y, size_t input_rows_count) const { if (typeid_cast(col_x.get())) { - return executeWithLeftArgConst(col_x, col_y); + return executeWithLeftArgConst(col_x, col_y, input_rows_count); } else if (typeid_cast(col_y.get())) { - return executeWithLeftArgConst(col_y, col_x); + return executeWithLeftArgConst(col_y, col_x, input_rows_count); } col_x = col_x->convertToFullColumnIfConst(); @@ -268,16 +269,13 @@ private: if (!array_x.hasEqualOffsets(array_y)) throw Exception(ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, "Array arguments for function {} must have equal sizes", getName()); - auto col_res = ColumnVector::create(); - auto & result = col_res->getData(); - - size_t size = offsets_x.size(); - result.resize(size); + auto col_res = ColumnVector::create(input_rows_count); + auto & result_data = col_res->getData(); ColumnArray::Offset current_offset = 0; - for (size_t row = 0; row < size; ++row) + for (size_t row = 0; row < input_rows_count; ++row) { - size_t array_size = offsets_x[row] - current_offset; + const size_t array_size = offsets_x[row] - current_offset; size_t i = 0; @@ -298,7 +296,7 @@ private: for (; i < array_size; ++i) Kernel::template accumulate(state, static_cast(data_x[current_offset + i]), static_cast(data_y[current_offset + i])); - result[row] = Kernel::template finalize(state); + result_data[row] = Kernel::template finalize(state); current_offset = offsets_x[row]; } @@ -307,7 +305,7 @@ private: } template - ColumnPtr executeWithLeftArgConst(ColumnPtr col_x, ColumnPtr col_y) const + ColumnPtr executeWithLeftArgConst(ColumnPtr col_x, ColumnPtr col_y, size_t input_rows_count) const { col_x = assert_cast(col_x.get())->getDataColumnPtr(); col_y = col_y->convertToFullColumnIfConst(); @@ -336,16 +334,13 @@ private: prev_offset = offset_y; } - auto col_res = ColumnVector::create(); + auto col_res = ColumnVector::create(input_rows_count); auto & result = col_res->getData(); - size_t size = offsets_y.size(); - result.resize(size); - ColumnArray::Offset current_offset = 0; - for (size_t row = 0; row < size; ++row) + for (size_t row = 0; row < input_rows_count; ++row) { - size_t array_size = offsets_x[0]; + const size_t array_size = offsets_x[0]; typename Kernel::template State state; size_t i = 0; diff --git a/tests/queries/0_stateless/02708_dotProduct.reference b/tests/queries/0_stateless/02708_dotProduct.reference index 593071a3521..93a67e4c0be 100644 --- a/tests/queries/0_stateless/02708_dotProduct.reference +++ b/tests/queries/0_stateless/02708_dotProduct.reference @@ -11,6 +11,8 @@ [-1,-2,-3] [4,5,6] -32 Int64 [1,2,3] 
[4,5,6] 32 Float32 [1,2,3] [4,5,6] 32 Float64 +[] [] 0 Float32 +[] [] 0 UInt16 -- Tuple (1,2,3) (4,5,6) 32 UInt64 (1,2,3) (4,5,6) 32 UInt64 @@ -24,6 +26,8 @@ (1,2,3) (4,5,6) 32 Float64 -- Non-const argument [1,2,3] [4,5,6] 32 UInt16 +[] [] 0 Float32 +[] [] 0 UInt16 -- Array with mixed element arguments types (result type is the supertype) [1,2,3] [4,5,6] 32 Float32 -- Tuple with mixed element arguments types @@ -33,8 +37,16 @@ 32 32 -- Tests that trigger special paths + -- non-const / non-const 0 61 1 186 +0 61 +1 186 +0 61 +1 186 + -- const / non-const +0 62 +1 187 0 62 1 187 0 62 diff --git a/tests/queries/0_stateless/02708_dotProduct.sql b/tests/queries/0_stateless/02708_dotProduct.sql index ac94ecc28d3..05c66777dff 100644 --- a/tests/queries/0_stateless/02708_dotProduct.sql +++ b/tests/queries/0_stateless/02708_dotProduct.sql @@ -19,6 +19,9 @@ SELECT [-1, -2, -3]::Array(Int32) AS x, [4, 5, 6]::Array(Int32) AS y, dotProduct SELECT [-1, -2, -3]::Array(Int64) AS x, [4, 5, 6]::Array(Int64) AS y, dotProduct(x, y) AS res, toTypeName(res); SELECT [1, 2, 3]::Array(Float32) AS x, [4, 5, 6]::Array(Float32) AS y, dotProduct(x, y) AS res, toTypeName(res); SELECT [1, 2, 3]::Array(Float64) AS x, [4, 5, 6]::Array(Float64) AS y, dotProduct(x, y) AS res, toTypeName(res); +-- empty arrays +SELECT []::Array(Float32) AS x, []::Array(Float32) AS y, dotProduct(x, y) AS res, toTypeName(res); +SELECT []::Array(UInt8) AS x, []::Array(UInt8) AS y, dotProduct(x, y) AS res, toTypeName(res); SELECT ' -- Tuple'; SELECT (1::UInt8, 2::UInt8, 3::UInt8) AS x, (4::UInt8, 5::UInt8, 6::UInt8) AS y, dotProduct(x, y) AS res, toTypeName(res); @@ -34,6 +37,8 @@ SELECT (1::Float64, 2::Float64, 3::Float64) AS x, (4::Float64, 5::Float64, 6::Fl SELECT '-- Non-const argument'; SELECT materialize([1::UInt8, 2::UInt8, 3::UInt8]) AS x, [4::UInt8, 5::UInt8, 6::UInt8] AS y, dotProduct(x, y) AS res, toTypeName(res); +SELECT materialize([]::Array(Float32)) AS x, []::Array(Float32) AS y, dotProduct(x, y) AS res, toTypeName(res); +SELECT materialize([]::Array(UInt8)) AS x, []::Array(UInt8) AS y, dotProduct(x, y) AS res, toTypeName(res); SELECT ' -- Array with mixed element arguments types (result type is the supertype)'; SELECT [1::UInt16, 2::UInt8, 3::Float32] AS x, [4::Int16, 5::Float32, 6::UInt8] AS y, dotProduct(x, y) AS res, toTypeName(res); @@ -50,7 +55,12 @@ SELECT '-- Tests that trigger special paths'; DROP TABLE IF EXISTS tab; CREATE TABLE tab(id UInt64, vec Array(Float32)) ENGINE = MergeTree ORDER BY id; INSERT INTO tab VALUES (0, [0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0, 3.0, 0.0, 1.0, 2.0]) (1, [5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0]); -SELECT id, arrayDotProduct(vec, vec) FROM tab ORDER BY id; -- non-const / non-const -SELECT id, arrayDotProduct([5.0, 2.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0]::Array(Float32), vec) FROM tab ORDER BY id; -- const / non-const -SELECT id, arrayDotProduct([5.0, 2.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0]::Array(Float64), vec) FROM tab ORDER BY id; -- const / non-const +SELECT ' -- non-const / non-const'; +SELECT id, arrayDotProduct(vec, vec) FROM tab ORDER BY id; +SELECT id, arrayDotProduct(vec::Array(Float64), vec::Array(Float64)) FROM tab ORDER BY id; +SELECT id, arrayDotProduct(vec::Array(UInt32), vec::Array(UInt32)) FROM tab ORDER BY id; +SELECT ' -- const / non-const'; +SELECT id, arrayDotProduct([5.0, 2.0, 2.0, 
3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0]::Array(Float32), vec) FROM tab ORDER BY id; +SELECT id, arrayDotProduct([5.0, 2.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0, 3.0, 5.0, 1.0, 2.0]::Array(Float64), vec) FROM tab ORDER BY id; +SELECT id, arrayDotProduct([5, 2, 2, 3, 5, 1, 2, 3, 5, 1, 2, 3, 5, 1, 2, 3, 5, 1, 2]::Array(UInt32), vec) FROM tab ORDER BY id; DROP TABLE tab; From 45efa69189784ce65bffd0d84462dcb30c1e6bf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 8 Mar 2024 15:17:59 +0100 Subject: [PATCH 0152/1081] Add perf tests --- tests/performance/bigint_formatting.xml | 13 +++++++++++++ 1 file changed, 13 insertions(+) create mode 100644 tests/performance/bigint_formatting.xml diff --git a/tests/performance/bigint_formatting.xml b/tests/performance/bigint_formatting.xml new file mode 100644 index 00000000000..c3454f91d1d --- /dev/null +++ b/tests/performance/bigint_formatting.xml @@ -0,0 +1,13 @@ + + + CREATE TABLE bigint ( u128 UInt128, i128 Int128, u256 UInt256, i256 Int256) ENGINE = Memory + AS + SELECT * FROM generateRandom('u128 UInt128, i128 Int128, u256 UInt256, i256 Int256', 42) LIMIT 50000; + + DROP TABLE IF EXISTS bigint + + SELECT * FROM bigint WHERE NOT ignore(toString(u128)) SETTINGS max_threads = 1 + SELECT * FROM bigint WHERE NOT ignore(toString(i128)) SETTINGS max_threads = 1 + SELECT * FROM bigint WHERE NOT ignore(toString(u256)) SETTINGS max_threads = 1 + SELECT * FROM bigint WHERE NOT ignore(toString(i256)) SETTINGS max_threads = 1 + From d6e0dd45b9cc88b9002de68138440cd24452fb17 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 8 Mar 2024 22:57:49 +0800 Subject: [PATCH 0153/1081] Fix build --- src/Functions/coverage.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Functions/coverage.cpp b/src/Functions/coverage.cpp index f4cac26df78..0f4cd1940b7 100644 --- a/src/Functions/coverage.cpp +++ b/src/Functions/coverage.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include From 2196c75dd8ddaeb1d2f18ca7b05fb4ae37550a4b Mon Sep 17 00:00:00 2001 From: johnnymatthews <9611008+johnnymatthews@users.noreply.github.com> Date: Fri, 8 Mar 2024 11:07:04 -0400 Subject: [PATCH 0154/1081] Adds substring-UTF8 docs. --- .../functions/string-functions.md | 69 ++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 3b49e4954ed..f9c3f91a12b 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -588,8 +588,41 @@ Result: ## substringUTF8 -Like `substring` but for Unicode code points. Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined. +Returns the substring of a string `s` which starts at the specified code point index `offset`. Counting starts from `1`. If `offset` is `0`, an empty string is returned. If `offset` is negative, the substring starts `offset` code points from the end of the string, rather than from the beginning. An optional argument `length` specifies the maximum number of code points the returned substring may have.
+ + +**Syntax** + +```sql +substringUTF8(s, offset[, length]) +``` + +**Arguments** + +- `s`: The string to calculate a substring from. [String](../../sql-reference/data-types/string.md), [FixedString](../../sql-reference/data-types/fixedstring.md) or [Enum](../../sql-reference/data-types/enum.md) +- `offset`: The starting position of the substring in `s`. [(U)Int*](../../sql-reference/data-types/int-uint.md). +- `length`: The maximum length of the substring. [(U)Int*](../../sql-reference/data-types/int-uint.md). Optional. + +**Returned value** + +A substring of `s` with at most `length` code points, starting at index `offset`. + +**Implementation details** + +Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined. + +**Example** + +```sql +SELECT 'database' AS string, substringUTF8(string, 5), substringUTF8(string, 5, 1) +``` + +```response +┌─string───┬─substringUTF8('database', 5)─┬─substringUTF8('database', 5, 1)─┐ │ database │ base │ b │ └──────────┴──────────────────────────────┴─────────────────────────────────┘ +``` ## substringIndex @@ -624,7 +657,39 @@ Result: ## substringIndexUTF8 -Like `substringIndex` but for Unicode code points. Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined. +Returns the substring of `s` before `count` occurrences of the delimiter `delim`, specifically for Unicode code points. + +**Syntax** + +```sql +substringIndexUTF8(s, delim, count) +``` + +**Arguments** + +- `s`: The string to extract the substring from. [String](../../sql-reference/data-types/string.md). +- `delim`: The character to split. [String](../../sql-reference/data-types/string.md). +- `count`: The number of occurrences of the delimiter to count before extracting the substring. If `count` is positive, everything to the left of the final delimiter (counting from the left) is returned. If `count` is negative, everything to the right of the final delimiter (counting from the right) is returned. [UInt or Int](../../sql-reference/data-types/int-uint.md). + +**Returned value** + +A substring [String](../../sql-reference/data-types/string.md) of `s` before `count` occurrences of `delim`. + +**Implementation details** + +Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined.
+ +**Example** + +```sql +SELECT substringIndexUTF8('www.clickhouse.com', '.', 2) +``` + +```response +www.clickhouse +``` ## appendTrailingCharIfAbsent From 0336ef3557b0c3c05ef974a4c6aa6771b3aa0757 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 8 Mar 2024 16:14:11 +0100 Subject: [PATCH 0155/1081] reload CI From e2317477f7b95d07407db8def968d286aa9e270d Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 8 Mar 2024 17:12:31 +0100 Subject: [PATCH 0156/1081] fix removing is_active node after re-creation --- src/Databases/DatabaseReplicatedWorker.cpp | 2 ++ tests/integration/test_replicated_database/test.py | 2 ++ 2 files changed, 4 insertions(+) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 2056b403ff6..0a6e8f9345e 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -75,6 +75,8 @@ void DatabaseReplicatedDDLWorker::initializeReplication() String active_path = fs::path(database->replica_path) / "active"; String active_id = toString(ServerUUID::get()); zookeeper->deleteEphemeralNodeIfContentMatches(active_path, active_id); + if (active_node_holder) + active_node_holder->setAlreadyRemoved(); zookeeper->create(active_path, active_id, zkutil::CreateMode::Ephemeral); active_node_holder.reset(); active_node_holder_zookeeper = zookeeper; diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index b47f86a843d..4f449f9a296 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -1141,6 +1141,8 @@ def test_sync_replica(started_cluster): dummy_node.query("SYSTEM SYNC DATABASE REPLICA test_sync_database") + assert "2\n" == main_node.query("SELECT sum(is_active) FROM system.clusters WHERE cluster='test_sync_database'") + assert dummy_node.query( "SELECT count() FROM system.tables where database='test_sync_database'" ).strip() == str(number_of_tables) From 4997f95426786a037328d4ed2bbce2245144eb1f Mon Sep 17 00:00:00 2001 From: Han Fei Date: Fri, 8 Mar 2024 17:30:58 +0100 Subject: [PATCH 0157/1081] fix build --- programs/keeper/Keeper.cpp | 3 +++ programs/server/Server.cpp | 5 +++++ src/Common/CgroupsMemoryUsageObserver.cpp | 4 +--- src/Common/CgroupsMemoryUsageObserver.h | 10 ++++++++-- 4 files changed, 17 insertions(+), 5 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index 76dd8cb15a5..c2ad81a3227 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -630,6 +630,9 @@ try { observer.emplace(std::chrono::seconds(cgroups_memory_observer_wait_time)); observer->startThread(); + observer->setOnMemoryLimitUpdate([&](){ + main_config_reloader->reload(); + }); } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index ee55cfd1837..c81e9b56e35 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1701,6 +1701,11 @@ try throw; } + if (cgroups_memory_usage_observer) + cgroups_memory_usage_observer->setOnMemoryLimitUpdate([&](){ + main_config_reloader->reload(); + }); + /// Reload config in SYSTEM RELOAD CONFIG query. 
global_context->setConfigReloadCallback([&]() { diff --git a/src/Common/CgroupsMemoryUsageObserver.cpp b/src/Common/CgroupsMemoryUsageObserver.cpp index 5f24c2553b5..3fda51a119f 100644 --- a/src/Common/CgroupsMemoryUsageObserver.cpp +++ b/src/Common/CgroupsMemoryUsageObserver.cpp @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include @@ -291,8 +290,7 @@ void CgroupsMemoryUsageObserver::runThread() last_memory_amount = memory_limit; /// if we find memory amount changes, we just reload config. /// Reloading config will check the memory amount again and calculate soft/hard limit again. - auto global_context = getContext()->getGlobalContext(); - global_context->reloadConfig(); + on_memory_limit_update(); } std::lock_guard set_limit_lock(set_limit_mutex); if (soft_limit > 0 && hard_limit > 0) diff --git a/src/Common/CgroupsMemoryUsageObserver.h b/src/Common/CgroupsMemoryUsageObserver.h index 6edf2e2049d..639433b5016 100644 --- a/src/Common/CgroupsMemoryUsageObserver.h +++ b/src/Common/CgroupsMemoryUsageObserver.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include @@ -15,7 +14,7 @@ namespace DB /// - When the soft memory limit is hit, drop jemalloc cache. /// - When the hard memory limit is hit, update MemoryTracking metric to throw memory exceptions faster. #if defined(OS_LINUX) -class CgroupsMemoryUsageObserver : public WithContext +class CgroupsMemoryUsageObserver { public: enum class CgroupsVersion @@ -28,6 +27,10 @@ public: ~CgroupsMemoryUsageObserver(); void setLimits(uint64_t hard_limit_, uint64_t soft_limit_); + void setOnMemoryLimitUpdate(std::function on_memory_limit_update_) + { + on_memory_limit_update = on_memory_limit_update_; + } void startThread(); size_t getHardLimit() const { return hard_limit; } @@ -46,6 +49,7 @@ private: using CallbackFn = std::function; CallbackFn on_hard_limit; CallbackFn on_soft_limit; + std::function on_memory_limit_update; uint64_t last_usage = 0; @@ -87,8 +91,10 @@ public: void setLimits(uint64_t, uint64_t) {} size_t readMemoryUsage() { return 0; } + void startThread(); size_t getHardLimit() { return 0; } size_t getSoftLimit() { return 0; } + void setOnMemoryLimitUpdate(std::function) {} }; #endif From babe00003620ca34f228009d919d5613db867dee Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Fri, 8 Mar 2024 16:43:10 +0000 Subject: [PATCH 0158/1081] Automatic style fix --- tests/integration/test_replicated_database/test.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 4f449f9a296..881659262ac 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -1141,7 +1141,9 @@ def test_sync_replica(started_cluster): dummy_node.query("SYSTEM SYNC DATABASE REPLICA test_sync_database") - assert "2\n" == main_node.query("SELECT sum(is_active) FROM system.clusters WHERE cluster='test_sync_database'") + assert "2\n" == main_node.query( + "SELECT sum(is_active) FROM system.clusters WHERE cluster='test_sync_database'" + ) assert dummy_node.query( "SELECT count() FROM system.tables where database='test_sync_database'" From 68360aa522169b1c0955837e93687c6d3a124912 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Fri, 8 Mar 2024 18:37:03 +0100 Subject: [PATCH 0159/1081] Clang format --- base/base/itoa.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/base/base/itoa.cpp b/base/base/itoa.cpp index 
608258c6b56..9bd2fcd1837 100644 --- a/base/base/itoa.cpp +++ b/base/base/itoa.cpp @@ -34,7 +34,6 @@ namespace { - template ALWAYS_INLINE inline constexpr T pow10(size_t x) { @@ -117,7 +116,6 @@ QuotientAndRemainder inline split(UnsignedOfSize value) return {quotient, remainder}; } - ALWAYS_INLINE inline char * outDigit(char * p, uint8_t value) { *p = '0' + value; @@ -147,7 +145,6 @@ ALWAYS_INLINE inline char * outTwoDigits(char * p, uint8_t value) return p; } - namespace convert { template @@ -296,7 +293,6 @@ ALWAYS_INLINE inline char * itoa(I i, char * p) } } - const uint64_t max_multiple_of_hundred_that_fits_in_64_bits = 1'00'00'00'00'00'00'00'00'00ull; const int max_multiple_of_hundred_blocks = 9; static_assert(max_multiple_of_hundred_that_fits_in_64_bits % 100 == 0); @@ -411,7 +407,6 @@ ALWAYS_INLINE inline char * writeUIntText(UInt256 _x, char * p) return highest_part_print; } - ALWAYS_INLINE inline char * writeLeadingMinus(char * pos) { *pos = '-'; @@ -451,7 +446,6 @@ ALWAYS_INLINE inline char * writeSIntText(T x, char * pos) } } - char * itoa(UInt8 i, char * p) { return convert::itoa(uint8_t(i), p); From bd530a175301860ba1ed13b5b97c56f12e7b13ce Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Fri, 8 Mar 2024 20:21:28 +0000 Subject: [PATCH 0160/1081] Forgot to enable analyzer --- .../03006_parallel_replicas_cte_explain_syntax_crash.sql | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.sql b/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.sql index a407fceb1c6..f46817d5e82 100644 --- a/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.sql +++ b/tests/queries/0_stateless/03006_parallel_replicas_cte_explain_syntax_crash.sql @@ -24,7 +24,7 @@ AS SELECT * FROM numbers(1000000) SETTINGS allow_suspicious_low_cardinality_types = 1; -SET allow_experimental_analyzer = 0; +SET allow_experimental_analyzer = 1; SET allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; EXPLAIN SYNTAX WITH @@ -41,4 +41,3 @@ WITH ) SELECT count() FROM cte2; --- SETTINGS allow_experimental_parallel_reading_from_replicas = 1, parallel_replicas_for_non_replicated_merge_tree = 1, cluster_for_parallel_replicas = 'test_cluster_one_shard_three_replicas_localhost', max_parallel_replicas = 3; From 2ce96f48f3c3958ef51c3e620b886d633436bb26 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 8 Mar 2024 21:58:01 +0100 Subject: [PATCH 0161/1081] Update 02962_system_sync_replica_lightweight_from_modifier.sh --- .../02962_system_sync_replica_lightweight_from_modifier.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/02962_system_sync_replica_lightweight_from_modifier.sh b/tests/queries/0_stateless/02962_system_sync_replica_lightweight_from_modifier.sh index f47801abf73..b61be87411d 100755 --- a/tests/queries/0_stateless/02962_system_sync_replica_lightweight_from_modifier.sh +++ b/tests/queries/0_stateless/02962_system_sync_replica_lightweight_from_modifier.sh @@ -14,7 +14,7 @@ export REPLICAS_TO_DROP for i in $(seq $TOTAL_REPLICAS); do $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS test_table_$i" - $CLICKHOUSE_CLIENT --query "CREATE TABLE test_table_$i (key UInt64, value UInt8) ENGINE = 
ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_table', '$i') ORDER BY key" + $CLICKHOUSE_CLIENT --query "CREATE TABLE test_table_$i (key UInt64, value UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_table', '$i') ORDER BY key SETTINGS old_parts_lifetime=1" done function insert_thread() { @@ -35,7 +35,7 @@ function sync_and_drop_replicas() { done for i in $(seq $REPLICAS_TO_DROP); do - $CLICKHOUSE_CLIENT --query "CREATE TABLE test_table_$i (key UInt64, value UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_table', '$i') ORDER BY key" + $CLICKHOUSE_CLIENT --query "CREATE TABLE test_table_$i (key UInt64, value UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/test_table', '$i') ORDER BY key SETTINGS old_parts_lifetime=1" done done } @@ -87,4 +87,4 @@ for i in $(seq $TOTAL_REPLICAS); do if [ $i -gt $REPLICAS_TO_DROP ]; then $CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS test_table_$i" fi -done \ No newline at end of file +done From 23b55ecbe8521692acab507408fc70665aa16f1b Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 8 Mar 2024 22:02:22 +0100 Subject: [PATCH 0162/1081] beautify exception --- src/Functions/array/arrayDotProduct.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/array/arrayDotProduct.cpp b/src/Functions/array/arrayDotProduct.cpp index 97dc9653bab..3f37e6f609f 100644 --- a/src/Functions/array/arrayDotProduct.cpp +++ b/src/Functions/array/arrayDotProduct.cpp @@ -212,7 +212,7 @@ private: throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Arguments of function {} has nested type {}. " - "Support: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", + "Supported types: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", getName(), type_x->getName()); } From c01a6775d747606b3aae70c9615404720208aeda Mon Sep 17 00:00:00 2001 From: Han Fei Date: Sat, 9 Mar 2024 02:28:39 +0100 Subject: [PATCH 0163/1081] fix style --- programs/keeper/Keeper.cpp | 3 ++- programs/server/Server.cpp | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/programs/keeper/Keeper.cpp b/programs/keeper/Keeper.cpp index c2ad81a3227..31e0b7dc576 100644 --- a/programs/keeper/Keeper.cpp +++ b/programs/keeper/Keeper.cpp @@ -630,7 +630,8 @@ try { observer.emplace(std::chrono::seconds(cgroups_memory_observer_wait_time)); observer->startThread(); - observer->setOnMemoryLimitUpdate([&](){ + observer->setOnMemoryLimitUpdate([&]() + { main_config_reloader->reload(); }); } diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index c81e9b56e35..33a30a1de2c 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1702,9 +1702,12 @@ try } if (cgroups_memory_usage_observer) - cgroups_memory_usage_observer->setOnMemoryLimitUpdate([&](){ + { + cgroups_memory_usage_observer->setOnMemoryLimitUpdate([&]() + { main_config_reloader->reload(); }); + } /// Reload config in SYSTEM RELOAD CONFIG query. 
global_context->setConfigReloadCallback([&]() From 009c2ea9f6e3c9dcd30a2528f7737cd9059296dd Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 05:27:29 +0100 Subject: [PATCH 0164/1081] Move a class into an anonymous namespace --- src/Disks/IO/ReadBufferFromRemoteFSGather.cpp | 3 --- src/Functions/CastOverloadResolver.cpp | 5 +++++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp index 417f7615dd7..38da4e96ff1 100644 --- a/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp +++ b/src/Disks/IO/ReadBufferFromRemoteFSGather.cpp @@ -1,7 +1,5 @@ #include "ReadBufferFromRemoteFSGather.h" -#include - #include #include #include @@ -9,7 +7,6 @@ #include #include #include -#include #include using namespace DB; diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 7fc46db50f1..a72563212ff 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -12,6 +12,9 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; } +namespace +{ + /** CastInternal does not preserve nullability of the data type, * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1). * @@ -136,6 +139,8 @@ using CastOverloadResolver = CastOverloadResolverImpl using CastInternalOverloadResolver = CastOverloadResolverImpl; +} + FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic) { From 2be09581ddc0ddfb57134f8a0ea6c33314f8071e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 05:48:52 +0100 Subject: [PATCH 0165/1081] Split CastOverloadResolver translation unit --- programs/library-bridge/CMakeLists.txt | 3 + programs/odbc-bridge/CMakeLists.txt | 3 + src/Functions/CastOverloadResolver.cpp | 139 +------------------------ 3 files changed, 7 insertions(+), 138 deletions(-) diff --git a/programs/library-bridge/CMakeLists.txt b/programs/library-bridge/CMakeLists.txt index dd0bf67cb64..f42b574b807 100644 --- a/programs/library-bridge/CMakeLists.txt +++ b/programs/library-bridge/CMakeLists.txt @@ -11,6 +11,9 @@ set (CLICKHOUSE_LIBRARY_BRIDGE_SOURCES LibraryBridgeHandlers.cpp SharedLibrary.cpp library-bridge.cpp + + ../../src/Functions/CastOverloadResolverImpl.cpp + ../../src/Functions/CastInternalOverloadResolverImpl.cpp ) clickhouse_add_executable(clickhouse-library-bridge ${CLICKHOUSE_LIBRARY_BRIDGE_SOURCES}) diff --git a/programs/odbc-bridge/CMakeLists.txt b/programs/odbc-bridge/CMakeLists.txt index 56373601b95..4e5dbac486e 100644 --- a/programs/odbc-bridge/CMakeLists.txt +++ b/programs/odbc-bridge/CMakeLists.txt @@ -13,6 +13,9 @@ set (CLICKHOUSE_ODBC_BRIDGE_SOURCES getIdentifierQuote.cpp odbc-bridge.cpp validateODBCConnectionString.cpp + + ../../src/Functions/CastOverloadResolverImpl.cpp + ../../src/Functions/CastInternalOverloadResolverImpl.cpp ) clickhouse_add_executable(clickhouse-odbc-bridge ${CLICKHOUSE_ODBC_BRIDGE_SOURCES}) diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index a72563212ff..1c57bcfa979 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -1,147 +1,11 @@ #include -#include #include -#include +#include namespace DB { -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} - -namespace -{ - -/** CastInternal does not preserve nullability of the data type, - * i.e. 
CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1). - * - * Cast preserves nullability according to setting `cast_keep_nullable`, - * i.e. Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1. - */ -template -class CastOverloadResolverImpl : public IFunctionOverloadResolver -{ -public: - using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; - - static constexpr auto name = cast_type == CastType::accurate - ? CastName::accurate_cast_name - : (cast_type == CastType::accurateOrNull ? CastName::accurate_cast_or_null_name : CastName::cast_name); - - String getName() const override { return name; } - - size_t getNumberOfArguments() const override { return 2; } - - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - - explicit CastOverloadResolverImpl(ContextPtr context_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_) - : context(context_) - , diagnostic(std::move(diagnostic_)) - , keep_nullable(keep_nullable_) - , data_type_validation_settings(data_type_validation_settings_) - { - } - - static FunctionOverloadResolverPtr create(ContextPtr context) - { - const auto & settings_ref = context->getSettingsRef(); - - if constexpr (internal) - return createImpl(context, {}, false /*keep_nullable*/); - - return createImpl(context, {}, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); - } - - static FunctionOverloadResolverPtr createImpl(ContextPtr context, std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) - { - assert(!internal || !keep_nullable); - return std::make_unique(context, std::move(diagnostic), keep_nullable, data_type_validation_settings); - } - - static FunctionOverloadResolverPtr createImpl(std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) - { - assert(!internal || !keep_nullable); - return std::make_unique(ContextPtr(), std::move(diagnostic), keep_nullable, data_type_validation_settings); - } - -protected: - - FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override - { - DataTypes data_types(arguments.size()); - - for (size_t i = 0; i < arguments.size(); ++i) - data_types[i] = arguments[i].type; - - auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); - return std::make_unique>(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); - } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - const auto & column = arguments.back().column; - if (!column) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. " - "Instead there is non-constant column of type {}", getName(), arguments.back().type->getName()); - - const auto * type_col = checkAndGetColumnConst(column.get()); - if (!type_col) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. 
" - "Instead there is a column with the following structure: {}", getName(), column->dumpStructure()); - - DataTypePtr type = DataTypeFactory::instance().get(type_col->getValue()); - validateDataType(type, data_type_validation_settings); - - if constexpr (cast_type == CastType::accurateOrNull) - return makeNullable(type); - - if constexpr (internal) - return type; - - if (keep_nullable && arguments.front().type->isNullable() && type->canBeInsideNullable()) - return makeNullable(type); - - return type; - } - - bool useDefaultImplementationForNulls() const override { return false; } - bool useDefaultImplementationForNothing() const override { return false; } - bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } - -private: - ContextPtr context; - std::optional diagnostic; - bool keep_nullable; - DataTypeValidationSettings data_type_validation_settings; -}; - - -struct CastOverloadName -{ - static constexpr auto cast_name = "CAST"; - static constexpr auto accurate_cast_name = "accurateCast"; - static constexpr auto accurate_cast_or_null_name = "accurateCastOrNull"; -}; - -struct CastInternalOverloadName -{ - static constexpr auto cast_name = "_CAST"; - static constexpr auto accurate_cast_name = "accurate_Cast"; - static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull"; -}; - -template -using CastOverloadResolver = CastOverloadResolverImpl; - -template -using CastInternalOverloadResolver = CastOverloadResolverImpl; - -} - - FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic) { switch (type) @@ -155,7 +19,6 @@ FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, st } } - REGISTER_FUNCTION(CastOverloadResolvers) { factory.registerFunction>({}, FunctionFactory::CaseInsensitive); From c67e8d5d73a6ac1647a023740a859e739d6ab2ee Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 05:50:19 +0100 Subject: [PATCH 0166/1081] Add files --- .../CastInternalOverloadResolverImpl.cpp | 10 ++ src/Functions/CastOverloadResolverImpl.cpp | 10 ++ src/Functions/CastOverloadResolverImpl.h | 149 ++++++++++++++++++ 3 files changed, 169 insertions(+) create mode 100644 src/Functions/CastInternalOverloadResolverImpl.cpp create mode 100644 src/Functions/CastOverloadResolverImpl.cpp create mode 100644 src/Functions/CastOverloadResolverImpl.h diff --git a/src/Functions/CastInternalOverloadResolverImpl.cpp b/src/Functions/CastInternalOverloadResolverImpl.cpp new file mode 100644 index 00000000000..d8ee0c76fd8 --- /dev/null +++ b/src/Functions/CastInternalOverloadResolverImpl.cpp @@ -0,0 +1,10 @@ +#include + +namespace DB +{ + +template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; + +} diff --git a/src/Functions/CastOverloadResolverImpl.cpp b/src/Functions/CastOverloadResolverImpl.cpp new file mode 100644 index 00000000000..d2325db5e0a --- /dev/null +++ b/src/Functions/CastOverloadResolverImpl.cpp @@ -0,0 +1,10 @@ +#include + +namespace DB +{ + +template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; + +} diff --git a/src/Functions/CastOverloadResolverImpl.h b/src/Functions/CastOverloadResolverImpl.h new file mode 100644 index 00000000000..61ccc66fb6c --- /dev/null +++ b/src/Functions/CastOverloadResolverImpl.h @@ -0,0 +1,149 @@ +#pragma once + +#include +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int 
ILLEGAL_TYPE_OF_ARGUMENT; +} + + +/** CastInternal does not preserve nullability of the data type, + * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1). + * + * Cast preserves nullability according to setting `cast_keep_nullable`, + * i.e. Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1. + */ +template +class CastOverloadResolverImpl : public IFunctionOverloadResolver +{ +public: + using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; + + static constexpr auto name = cast_type == CastType::accurate + ? CastName::accurate_cast_name + : (cast_type == CastType::accurateOrNull ? CastName::accurate_cast_or_null_name : CastName::cast_name); + + String getName() const override { return name; } + + size_t getNumberOfArguments() const override { return 2; } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + + explicit CastOverloadResolverImpl(ContextPtr context_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_) + : context(context_) + , diagnostic(std::move(diagnostic_)) + , keep_nullable(keep_nullable_) + , data_type_validation_settings(data_type_validation_settings_) + { + } + + static FunctionOverloadResolverPtr create(ContextPtr context) + { + const auto & settings_ref = context->getSettingsRef(); + + if constexpr (internal) + return createImpl(context, {}, false /*keep_nullable*/); + + return createImpl(context, {}, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); + } + + static FunctionOverloadResolverPtr createImpl(ContextPtr context, std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) + { + assert(!internal || !keep_nullable); + return std::make_unique(context, std::move(diagnostic), keep_nullable, data_type_validation_settings); + } + + static FunctionOverloadResolverPtr createImpl(std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) + { + assert(!internal || !keep_nullable); + return std::make_unique(ContextPtr(), std::move(diagnostic), keep_nullable, data_type_validation_settings); + } + +protected: + + FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override + { + DataTypes data_types(arguments.size()); + + for (size_t i = 0; i < arguments.size(); ++i) + data_types[i] = arguments[i].type; + + auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); + return std::make_unique>(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + const auto & column = arguments.back().column; + if (!column) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. " + "Instead there is non-constant column of type {}", getName(), arguments.back().type->getName()); + + const auto * type_col = checkAndGetColumnConst(column.get()); + if (!type_col) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. 
" + "Instead there is a column with the following structure: {}", getName(), column->dumpStructure()); + + DataTypePtr type = DataTypeFactory::instance().get(type_col->getValue()); + validateDataType(type, data_type_validation_settings); + + if constexpr (cast_type == CastType::accurateOrNull) + return makeNullable(type); + + if constexpr (internal) + return type; + + if (keep_nullable && arguments.front().type->isNullable() && type->canBeInsideNullable()) + return makeNullable(type); + + return type; + } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForNothing() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + +private: + ContextPtr context; + std::optional diagnostic; + bool keep_nullable; + DataTypeValidationSettings data_type_validation_settings; +}; + + +struct CastOverloadName +{ + static constexpr auto cast_name = "CAST"; + static constexpr auto accurate_cast_name = "accurateCast"; + static constexpr auto accurate_cast_or_null_name = "accurateCastOrNull"; +}; + +struct CastInternalOverloadName +{ + static constexpr auto cast_name = "_CAST"; + static constexpr auto accurate_cast_name = "accurate_Cast"; + static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull"; +}; + +template +using CastOverloadResolver = CastOverloadResolverImpl; + +template +using CastInternalOverloadResolver = CastOverloadResolverImpl; + +extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; + +extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; + +} From 6d45eecdad4b737584b9f64e9a44c8fd845f17a4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 06:11:34 +0100 Subject: [PATCH 0167/1081] Remove garbage --- programs/odbc-bridge/CMakeLists.txt | 3 -- src/Functions/CMakeLists.txt | 2 + .../CastInternalOverloadResolverImpl.cpp | 6 +-- src/Functions/CastOverloadResolverImpl.cpp | 6 +-- src/Functions/CastOverloadResolverImpl.h | 51 +++++++++---------- src/Functions/FunctionsConversion.cpp | 15 ++++++ src/Functions/FunctionsConversion.h | 17 +------ 7 files changed, 49 insertions(+), 51 deletions(-) diff --git a/programs/odbc-bridge/CMakeLists.txt b/programs/odbc-bridge/CMakeLists.txt index 4e5dbac486e..56373601b95 100644 --- a/programs/odbc-bridge/CMakeLists.txt +++ b/programs/odbc-bridge/CMakeLists.txt @@ -13,9 +13,6 @@ set (CLICKHOUSE_ODBC_BRIDGE_SOURCES getIdentifierQuote.cpp odbc-bridge.cpp validateODBCConnectionString.cpp - - ../../src/Functions/CastOverloadResolverImpl.cpp - ../../src/Functions/CastInternalOverloadResolverImpl.cpp ) clickhouse_add_executable(clickhouse-odbc-bridge ${CLICKHOUSE_ODBC_BRIDGE_SOURCES}) diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index ac3e3671ae0..f27bcae1fe3 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -15,6 +15,8 @@ extract_into_parent_list(clickhouse_functions_sources dbms_sources checkHyperscanRegexp.cpp array/has.cpp CastOverloadResolver.cpp + CastOverloadResolverImpl.cpp + CastInternalOverloadResolverImpl.cpp ) extract_into_parent_list(clickhouse_functions_headers dbms_headers IFunction.h diff --git a/src/Functions/CastInternalOverloadResolverImpl.cpp b/src/Functions/CastInternalOverloadResolverImpl.cpp index d8ee0c76fd8..8b74f76ca39 100644 --- 
a/src/Functions/CastInternalOverloadResolverImpl.cpp +++ b/src/Functions/CastInternalOverloadResolverImpl.cpp @@ -3,8 +3,8 @@ namespace DB { -template class CastOverloadResolverImpl; -template class CastOverloadResolverImpl; -template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; } diff --git a/src/Functions/CastOverloadResolverImpl.cpp b/src/Functions/CastOverloadResolverImpl.cpp index d2325db5e0a..a7f7024892e 100644 --- a/src/Functions/CastOverloadResolverImpl.cpp +++ b/src/Functions/CastOverloadResolverImpl.cpp @@ -3,8 +3,8 @@ namespace DB { -template class CastOverloadResolverImpl; -template class CastOverloadResolverImpl; -template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; +template class CastOverloadResolverImpl; } diff --git a/src/Functions/CastOverloadResolverImpl.h b/src/Functions/CastOverloadResolverImpl.h index 61ccc66fb6c..36376c5f254 100644 --- a/src/Functions/CastOverloadResolverImpl.h +++ b/src/Functions/CastOverloadResolverImpl.h @@ -13,6 +13,15 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; } +struct CastName +{ + static constexpr auto name = "CAST"; +}; + +struct CastInternalName +{ + static constexpr auto name = "_CAST"; +}; /** CastInternal does not preserve nullability of the data type, * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1). @@ -20,15 +29,16 @@ namespace ErrorCodes * Cast preserves nullability according to setting `cast_keep_nullable`, * i.e. Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1. */ -template +template class CastOverloadResolverImpl : public IFunctionOverloadResolver { public: using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; static constexpr auto name = cast_type == CastType::accurate - ? CastName::accurate_cast_name - : (cast_type == CastType::accurateOrNull ? CastName::accurate_cast_or_null_name : CastName::cast_name); + ? "accurateCast" + : (cast_type == CastType::accurateOrNull ? "accurateCastOrNull" + : (internal ? 
"_CAST" : "CAST")); String getName() const override { return name; } @@ -76,7 +86,9 @@ protected: data_types[i] = arguments[i].type; auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); - return std::make_unique>(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); + + using Function = FunctionCast>; + return std::make_unique(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override @@ -118,32 +130,19 @@ private: }; -struct CastOverloadName -{ - static constexpr auto cast_name = "CAST"; - static constexpr auto accurate_cast_name = "accurateCast"; - static constexpr auto accurate_cast_or_null_name = "accurateCastOrNull"; -}; - -struct CastInternalOverloadName -{ - static constexpr auto cast_name = "_CAST"; - static constexpr auto accurate_cast_name = "accurate_Cast"; - static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull"; -}; +template +using CastOverloadResolver = CastOverloadResolverImpl; template -using CastOverloadResolver = CastOverloadResolverImpl; +using CastInternalOverloadResolver = CastOverloadResolverImpl; -template -using CastInternalOverloadResolver = CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; +extern template class CastOverloadResolverImpl; } diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 01e057e19a1..59455ba51b7 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -5,6 +5,21 @@ namespace DB { +UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) +{ + const auto * arg_type = named_column.type.get(); + bool ok = checkAndGetDataType(arg_type) + || checkAndGetDataType(arg_type) + || checkAndGetDataType(arg_type) + || checkAndGetDataType(arg_type); + if (!ok) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type of toDecimal() scale {}", named_column.type->getName()); + + Field field; + named_column.column->get(0, field); + return static_cast(field.get()); +} + REGISTER_FUNCTION(Conversion) { factory.registerFunction(); diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 1522e76893e..fac74715fa0 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -98,20 +98,7 @@ namespace ErrorCodes * toType - conversion in "natural way"; */ -inline UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) -{ - const auto * arg_type = named_column.type.get(); - bool ok = checkAndGetDataType(arg_type) - || checkAndGetDataType(arg_type) - || checkAndGetDataType(arg_type) - || checkAndGetDataType(arg_type); - if (!ok) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type of toDecimal() scale {}", named_column.type->getName()); - - Field field; - named_column.column->get(0, field); - return static_cast(field.get()); -} 
+UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column); /// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type. struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; }; @@ -3182,8 +3169,6 @@ private: std::optional diagnostic; }; -struct CastName { static constexpr auto name = "CAST"; }; -struct CastInternalName { static constexpr auto name = "_CAST"; }; class FunctionCastBase : public IFunctionBase { From ea54ac3cb4842c332529bab14c0e619115911a2c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 06:12:22 +0100 Subject: [PATCH 0168/1081] Remove garbage --- programs/library-bridge/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/programs/library-bridge/CMakeLists.txt b/programs/library-bridge/CMakeLists.txt index f42b574b807..dd0bf67cb64 100644 --- a/programs/library-bridge/CMakeLists.txt +++ b/programs/library-bridge/CMakeLists.txt @@ -11,9 +11,6 @@ set (CLICKHOUSE_LIBRARY_BRIDGE_SOURCES LibraryBridgeHandlers.cpp SharedLibrary.cpp library-bridge.cpp - - ../../src/Functions/CastOverloadResolverImpl.cpp - ../../src/Functions/CastInternalOverloadResolverImpl.cpp ) clickhouse_add_executable(clickhouse-library-bridge ${CLICKHOUSE_LIBRARY_BRIDGE_SOURCES}) From 7983e2b6206c5b78158802430b639e9b060d7b0c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 06:14:32 +0100 Subject: [PATCH 0169/1081] Remove garbage --- src/Functions/CastOverloadResolver.cpp | 15 +++++++++++++++ src/Functions/FunctionsConversion.cpp | 15 --------------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 1c57bcfa979..a343dbb62fe 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -6,6 +6,21 @@ namespace DB { +UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) +{ + const auto * arg_type = named_column.type.get(); + bool ok = checkAndGetDataType(arg_type) + || checkAndGetDataType(arg_type) + || checkAndGetDataType(arg_type) + || checkAndGetDataType(arg_type); + if (!ok) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type of toDecimal() scale {}", named_column.type->getName()); + + Field field; + named_column.column->get(0, field); + return static_cast(field.get()); +} + FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic) { switch (type) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 59455ba51b7..01e057e19a1 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -5,21 +5,6 @@ namespace DB { -UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) -{ - const auto * arg_type = named_column.type.get(); - bool ok = checkAndGetDataType(arg_type) - || checkAndGetDataType(arg_type) - || checkAndGetDataType(arg_type) - || checkAndGetDataType(arg_type); - if (!ok) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type of toDecimal() scale {}", named_column.type->getName()); - - Field field; - named_column.column->get(0, field); - return static_cast(field.get()); -} - REGISTER_FUNCTION(Conversion) { factory.registerFunction(); From 0d05b8ccc1f56d774c502f780cbbd98326dd8401 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 06:37:50 +0100 Subject: [PATCH 0170/1081] Fix style --- src/Functions/CastOverloadResolver.cpp | 5 +++++ 1 
file changed, 5 insertions(+) diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index a343dbb62fe..6bacc7f4847 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -6,6 +6,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int ILLEGAL_TYPE_OF_ARGUMENT; +} + UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) { const auto * arg_type = named_column.type.get(); From dc7f4b39eef4c2a9b47cee7b7197c58fd86c0520 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 07:00:28 +0100 Subject: [PATCH 0171/1081] Remove garbage --- src/Functions/CMakeLists.txt | 2 - .../CastInternalOverloadResolverImpl.cpp | 10 --- src/Functions/CastOverloadResolver.cpp | 21 +----- src/Functions/CastOverloadResolverImpl.cpp | 10 --- src/Functions/CastOverloadResolverImpl.h | 73 +++++++------------ 5 files changed, 31 insertions(+), 85 deletions(-) delete mode 100644 src/Functions/CastInternalOverloadResolverImpl.cpp delete mode 100644 src/Functions/CastOverloadResolverImpl.cpp diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt index f27bcae1fe3..ac3e3671ae0 100644 --- a/src/Functions/CMakeLists.txt +++ b/src/Functions/CMakeLists.txt @@ -15,8 +15,6 @@ extract_into_parent_list(clickhouse_functions_sources dbms_sources checkHyperscanRegexp.cpp array/has.cpp CastOverloadResolver.cpp - CastOverloadResolverImpl.cpp - CastInternalOverloadResolverImpl.cpp ) extract_into_parent_list(clickhouse_functions_headers dbms_headers IFunction.h diff --git a/src/Functions/CastInternalOverloadResolverImpl.cpp b/src/Functions/CastInternalOverloadResolverImpl.cpp deleted file mode 100644 index 8b74f76ca39..00000000000 --- a/src/Functions/CastInternalOverloadResolverImpl.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include - -namespace DB -{ - -template class CastOverloadResolverImpl; -template class CastOverloadResolverImpl; -template class CastOverloadResolverImpl; - -} diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 6bacc7f4847..79b17d3297c 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -26,27 +26,14 @@ UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) return static_cast(field.get()); } -FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic) -{ - switch (type) - { - case CastType::nonAccurate: - return CastInternalOverloadResolver::createImpl(diagnostic); - case CastType::accurate: - return CastInternalOverloadResolver::createImpl(diagnostic); - case CastType::accurateOrNull: - return CastInternalOverloadResolver::createImpl(diagnostic); - } -} - REGISTER_FUNCTION(CastOverloadResolvers) { - factory.registerFunction>({}, FunctionFactory::CaseInsensitive); + factory.registerFunction("_CAST", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::nonAccurate, true); }, {}, FunctionFactory::CaseInsensitive); /// Note: "internal" (not affected by null preserving setting) versions of accurate cast functions are unneeded. 
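For illustration only: the lambda registrations above collapse what used to be one registered class per template instantiation into a single runtime-parameterised factory. A minimal self-contained sketch of the pattern, with a simplified registry and a toy Resolver type standing in for the real FunctionFactory machinery:

    #include <functional>
    #include <map>
    #include <memory>
    #include <string>

    enum class CastType { nonAccurate, accurate, accurateOrNull };
    struct Resolver { CastType type; bool internal; };

    /// Simplified stand-in for FunctionFactory: a name -> creator-lambda map.
    using Creator = std::function<std::unique_ptr<Resolver>()>;
    std::map<std::string, Creator> registry;

    void registerFunction(const std::string & name, Creator creator)
    {
        registry.emplace(name, std::move(creator));
    }

    int main()
    {
        /// One creator per name; all of them share a single class whose
        /// behaviour is chosen by constructor arguments, not by templates.
        registerFunction("_CAST", [] { return std::make_unique<Resolver>(Resolver{CastType::nonAccurate, true}); });
        registerFunction("CAST", [] { return std::make_unique<Resolver>(Resolver{CastType::nonAccurate, false}); });
        registerFunction("accurateCast", [] { return std::make_unique<Resolver>(Resolver{CastType::accurate, false}); });

        auto cast = registry.at("CAST")();   /// built on demand, as in the factory
        return cast->internal ? 1 : 0;
    }
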
- factory.registerFunction>({}, FunctionFactory::CaseInsensitive); - factory.registerFunction>(); - factory.registerFunction>(); + factory.registerFunction("CAST", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::nonAccurate, false); }, {}, FunctionFactory::CaseInsensitive); + factory.registerFunction("accurateCast", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::accurate, false); }, {}); + factory.registerFunction("accurateCastOrNull", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::accurateOrNull, false); }, {}); } } diff --git a/src/Functions/CastOverloadResolverImpl.cpp b/src/Functions/CastOverloadResolverImpl.cpp deleted file mode 100644 index a7f7024892e..00000000000 --- a/src/Functions/CastOverloadResolverImpl.cpp +++ /dev/null @@ -1,10 +0,0 @@ -#include - -namespace DB -{ - -template class CastOverloadResolverImpl; -template class CastOverloadResolverImpl; -template class CastOverloadResolverImpl; - -} diff --git a/src/Functions/CastOverloadResolverImpl.h b/src/Functions/CastOverloadResolverImpl.h index 36376c5f254..b3f3e50ebf8 100644 --- a/src/Functions/CastOverloadResolverImpl.h +++ b/src/Functions/CastOverloadResolverImpl.h @@ -29,55 +29,48 @@ struct CastInternalName * Cast preserves nullability according to setting `cast_keep_nullable`, * i.e. Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1. */ -template class CastOverloadResolverImpl : public IFunctionOverloadResolver { public: using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; - static constexpr auto name = cast_type == CastType::accurate - ? "accurateCast" - : (cast_type == CastType::accurateOrNull ? "accurateCastOrNull" - : (internal ? 
"_CAST" : "CAST")); - - String getName() const override { return name; } + String getName() const override + { + if (cast_type == CastType::accurate) + return "accurateCast"; + if (cast_type == CastType::accurateOrNull) + return "accurateCastOrNull"; + if (internal) + return "_CAST"; + else + return "CAST"; + } size_t getNumberOfArguments() const override { return 2; } ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - explicit CastOverloadResolverImpl(ContextPtr context_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_) + explicit CastOverloadResolverImpl(ContextPtr context_, CastType cast_type_, bool internal_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_) : context(context_) + , cast_type(cast_type_) + , internal(internal_) , diagnostic(std::move(diagnostic_)) , keep_nullable(keep_nullable_) , data_type_validation_settings(data_type_validation_settings_) { } - static FunctionOverloadResolverPtr create(ContextPtr context) + static FunctionOverloadResolverPtr create(ContextPtr context, CastType cast_type, bool internal) { const auto & settings_ref = context->getSettingsRef(); - if constexpr (internal) - return createImpl(context, {}, false /*keep_nullable*/); - - return createImpl(context, {}, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); - } - - static FunctionOverloadResolverPtr createImpl(ContextPtr context, std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) - { - assert(!internal || !keep_nullable); - return std::make_unique(context, std::move(diagnostic), keep_nullable, data_type_validation_settings); - } - - static FunctionOverloadResolverPtr createImpl(std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {}) - { - assert(!internal || !keep_nullable); - return std::make_unique(ContextPtr(), std::move(diagnostic), keep_nullable, data_type_validation_settings); + if (internal) + return std::make_unique(context, cast_type, internal, std::nullopt, false /*keep_nullable*/, DataTypeValidationSettings{}); + else + return std::make_unique(context, cast_type, internal, std::nullopt, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); } protected: - FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override { DataTypes data_types(arguments.size()); @@ -87,8 +80,10 @@ protected: auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); - using Function = FunctionCast>; - return std::make_unique(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); + if (internal) + return std::make_unique>(context, CastInternalName::name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); + else + return std::make_unique>(context, CastName::name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override @@ -106,10 +101,10 @@ protected: DataTypePtr type = DataTypeFactory::instance().get(type_col->getValue()); validateDataType(type, data_type_validation_settings); - if constexpr (cast_type == CastType::accurateOrNull) + if (cast_type == CastType::accurateOrNull) return 
makeNullable(type); - if constexpr (internal) + if (internal) return type; if (keep_nullable && arguments.front().type->isNullable() && type->canBeInsideNullable()) @@ -124,25 +119,11 @@ protected: private: ContextPtr context; + CastType cast_type; + bool internal; std::optional diagnostic; bool keep_nullable; DataTypeValidationSettings data_type_validation_settings; }; - -template -using CastOverloadResolver = CastOverloadResolverImpl; - -template -using CastInternalOverloadResolver = CastOverloadResolverImpl; - - -extern template class CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; - -extern template class CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; -extern template class CastOverloadResolverImpl; - } From 83a5611355fdb7bf6ad95a8ae6b3e60321e00c69 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 07:04:05 +0100 Subject: [PATCH 0172/1081] Remove garbage --- src/Functions/CastOverloadResolver.cpp | 13 +++++++++---- src/Functions/CastOverloadResolverImpl.h | 6 +++--- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 79b17d3297c..898842c8505 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -26,14 +26,19 @@ UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) return static_cast(field.get()); } +FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic) +{ + return CastOverloadResolverImpl::create(ContextPtr{}, type, true, diagnostic); +} + REGISTER_FUNCTION(CastOverloadResolvers) { - factory.registerFunction("_CAST", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::nonAccurate, true); }, {}, FunctionFactory::CaseInsensitive); + factory.registerFunction("_CAST", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::nonAccurate, true, {}); }, {}, FunctionFactory::CaseInsensitive); /// Note: "internal" (not affected by null preserving setting) versions of accurate cast functions are unneeded. 
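createInternalCastOverloadResolver above gives other translation units a stable entry point without exposing CastOverloadResolverImpl. A hedged sketch of this compilation-firewall idiom, with invented Widget names rather than the real classes:

    #include <memory>

    /// widget.h: callers see only an interface and a factory signature.
    struct IWidget
    {
        virtual ~IWidget() = default;
        virtual int value() const = 0;
    };
    std::unique_ptr<IWidget> createWidget(int seed);

    /// widget.cpp: the concrete type never leaves this translation unit,
    /// so changing it does not recompile the callers.
    namespace
    {
        struct WidgetImpl : IWidget
        {
            explicit WidgetImpl(int seed_) : seed(seed_) {}
            int value() const override { return seed * 2; }
            int seed;
        };
    }

    std::unique_ptr<IWidget> createWidget(int seed)
    {
        return std::make_unique<WidgetImpl>(seed);
    }
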
- factory.registerFunction("CAST", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::nonAccurate, false); }, {}, FunctionFactory::CaseInsensitive); - factory.registerFunction("accurateCast", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::accurate, false); }, {}); - factory.registerFunction("accurateCastOrNull", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::accurateOrNull, false); }, {}); + factory.registerFunction("CAST", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::nonAccurate, false, {}); }, {}, FunctionFactory::CaseInsensitive); + factory.registerFunction("accurateCast", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::accurate, false, {}); }, {}); + factory.registerFunction("accurateCastOrNull", [](ContextPtr context){ return CastOverloadResolverImpl::create(context, CastType::accurateOrNull, false, {}); }, {}); } } diff --git a/src/Functions/CastOverloadResolverImpl.h b/src/Functions/CastOverloadResolverImpl.h index b3f3e50ebf8..74b3fd3df3f 100644 --- a/src/Functions/CastOverloadResolverImpl.h +++ b/src/Functions/CastOverloadResolverImpl.h @@ -60,14 +60,14 @@ public: { } - static FunctionOverloadResolverPtr create(ContextPtr context, CastType cast_type, bool internal) + static FunctionOverloadResolverPtr create(ContextPtr context, CastType cast_type, bool internal, std::optional diagnostic) { const auto & settings_ref = context->getSettingsRef(); if (internal) - return std::make_unique(context, cast_type, internal, std::nullopt, false /*keep_nullable*/, DataTypeValidationSettings{}); + return std::make_unique(context, cast_type, internal, diagnostic, false /*keep_nullable*/, DataTypeValidationSettings{}); else - return std::make_unique(context, cast_type, internal, std::nullopt, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); + return std::make_unique(context, cast_type, internal, diagnostic, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); } protected: From 574d4863224ed7343db64f976f53442f1d166e4a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 07:55:59 +0100 Subject: [PATCH 0173/1081] Something --- programs/library-bridge/CMakeLists.txt | 1 + .../library-bridge/createFunctionBaseCast.cpp | 19 +++++++++++++ programs/odbc-bridge/CMakeLists.txt | 1 + .../odbc-bridge/createFunctionBaseCast.cpp | 19 +++++++++++++ src/Functions/CastOverloadResolver.cpp | 1 + src/Functions/CastOverloadResolverImpl.h | 28 ++++--------------- src/Functions/FunctionsConversion.cpp | 23 +++++++++++++++ src/Functions/FunctionsConversion.h | 7 +++++ 8 files changed, 77 insertions(+), 22 deletions(-) create mode 100644 programs/library-bridge/createFunctionBaseCast.cpp create mode 100644 programs/odbc-bridge/createFunctionBaseCast.cpp diff --git a/programs/library-bridge/CMakeLists.txt b/programs/library-bridge/CMakeLists.txt index dd0bf67cb64..98d8848502d 100644 --- a/programs/library-bridge/CMakeLists.txt +++ b/programs/library-bridge/CMakeLists.txt @@ -11,6 +11,7 @@ set (CLICKHOUSE_LIBRARY_BRIDGE_SOURCES LibraryBridgeHandlers.cpp SharedLibrary.cpp library-bridge.cpp + createFunctionBaseCast.cpp ) clickhouse_add_executable(clickhouse-library-bridge ${CLICKHOUSE_LIBRARY_BRIDGE_SOURCES}) diff --git a/programs/library-bridge/createFunctionBaseCast.cpp b/programs/library-bridge/createFunctionBaseCast.cpp new file mode 100644 index 00000000000..473aa1ca81d --- 
/dev/null +++ b/programs/library-bridge/createFunctionBaseCast.cpp @@ -0,0 +1,19 @@ +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +FunctionBasePtr createFunctionBaseCast( + ContextPtr, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); +} + +} diff --git a/programs/odbc-bridge/CMakeLists.txt b/programs/odbc-bridge/CMakeLists.txt index 56373601b95..18cda4d7a04 100644 --- a/programs/odbc-bridge/CMakeLists.txt +++ b/programs/odbc-bridge/CMakeLists.txt @@ -13,6 +13,7 @@ set (CLICKHOUSE_ODBC_BRIDGE_SOURCES getIdentifierQuote.cpp odbc-bridge.cpp validateODBCConnectionString.cpp + createFunctionBaseCast.cpp ) clickhouse_add_executable(clickhouse-odbc-bridge ${CLICKHOUSE_ODBC_BRIDGE_SOURCES}) diff --git a/programs/odbc-bridge/createFunctionBaseCast.cpp b/programs/odbc-bridge/createFunctionBaseCast.cpp new file mode 100644 index 00000000000..473aa1ca81d --- /dev/null +++ b/programs/odbc-bridge/createFunctionBaseCast.cpp @@ -0,0 +1,19 @@ +#include +#include + + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + +FunctionBasePtr createFunctionBaseCast( + ContextPtr, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) +{ + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for ODBC Bridge"); +} + +} diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 898842c8505..6ab6f95410f 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -1,6 +1,7 @@ #include #include #include +#include namespace DB diff --git a/src/Functions/CastOverloadResolverImpl.h b/src/Functions/CastOverloadResolverImpl.h index 74b3fd3df3f..2bb83040163 100644 --- a/src/Functions/CastOverloadResolverImpl.h +++ b/src/Functions/CastOverloadResolverImpl.h @@ -1,7 +1,12 @@ #pragma once #include +#include #include +#include +#include +#include +#include #include @@ -13,15 +18,6 @@ namespace ErrorCodes { extern const int ILLEGAL_TYPE_OF_ARGUMENT; } -struct CastName -{ - static constexpr auto name = "CAST"; -}; - -struct CastInternalName -{ - static constexpr auto name = "_CAST"; -}; /** CastInternal does not preserve nullability of the data type, * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1).
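The two bridge stubs above exist only so that clickhouse-library-bridge and clickhouse-odbc-bridge link without pulling in the heavy conversion code. A minimal sketch of the same link-time stub idiom, using a hypothetical heavyFeature in place of createFunctionBaseCast:

    #include <stdexcept>

    /// Declared in a header shared by all binaries; the full implementation
    /// is linked into the main binary only.
    int heavyFeature(int input);

    /// Stub linked into the slim auxiliary binary instead: it satisfies the
    /// linker but is never expected to be called.
    int heavyFeature(int)
    {
        throw std::runtime_error("heavyFeature is not implemented in this binary");
    }
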
@@ -32,8 +28,6 @@ struct CastInternalName class CastOverloadResolverImpl : public IFunctionOverloadResolver { public: - using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; - String getName() const override { if (cast_type == CastType::accurate) @@ -73,17 +67,7 @@ public: protected: FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override { - DataTypes data_types(arguments.size()); - - for (size_t i = 0; i < arguments.size(); ++i) - data_types[i] = arguments[i].type; - - auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); - - if (internal) - return std::make_unique>(context, CastInternalName::name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); - else - return std::make_unique>(context, CastName::name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); + return createFunctionBaseCast(context, arguments, return_type, diagnostic, cast_type); } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 01e057e19a1..ce9e4fd66e1 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -5,6 +5,29 @@ namespace DB { +namespace +{ + +struct CastInternalName { static constexpr auto name = "_CAST"; }; + +} + +FunctionBasePtr createFunctionBaseCast( + ContextPtr context + , const ColumnsWithTypeAndName & arguments + , const DataTypePtr & return_type + , std::optional diagnostic + , CastType cast_type) +{ + DataTypes data_types(arguments.size()); + + for (size_t i = 0; i < arguments.size(); ++i) + data_types[i] = arguments[i].type; + + auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); + return std::make_unique>(context, CastInternalName::name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); +} + REGISTER_FUNCTION(Conversion) { factory.registerFunction(); diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index fac74715fa0..689b72dc917 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -4972,4 +4972,11 @@ public: } }; +FunctionBasePtr createFunctionBaseCast( + ContextPtr context + , const ColumnsWithTypeAndName & arguments + , const DataTypePtr & return_type + , std::optional diagnostic + , CastType cast_type); + } From feed74e598503eeaa0e6aac44684f19e1fd42361 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 07:59:09 +0100 Subject: [PATCH 0174/1081] Better --- src/Functions/CastOverloadResolver.cpp | 100 +++++++++++++++++++- src/Functions/CastOverloadResolverImpl.h | 113 ----------------------- 2 files changed, 99 insertions(+), 114 deletions(-) delete mode 100644 src/Functions/CastOverloadResolverImpl.h diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 6ab6f95410f..98debc3e2a4 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -1,7 +1,12 @@ #include #include -#include +#include +#include +#include #include +#include +#include +#include namespace DB @@ -12,6 +17,99 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; } + +/** CastInternal does not preserve nullability of the data type, + * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1). 
+ * + * Cast preserves nullability according to setting `cast_keep_nullable`, + * i.e. Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1. + */ +class CastOverloadResolverImpl : public IFunctionOverloadResolver +{ +public: + String getName() const override + { + if (cast_type == CastType::accurate) + return "accurateCast"; + if (cast_type == CastType::accurateOrNull) + return "accurateCastOrNull"; + if (internal) + return "_CAST"; + else + return "CAST"; + } + + size_t getNumberOfArguments() const override { return 2; } + + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + + explicit CastOverloadResolverImpl(ContextPtr context_, CastType cast_type_, bool internal_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_) + : context(context_) + , cast_type(cast_type_) + , internal(internal_) + , diagnostic(std::move(diagnostic_)) + , keep_nullable(keep_nullable_) + , data_type_validation_settings(data_type_validation_settings_) + { + } + + static FunctionOverloadResolverPtr create(ContextPtr context, CastType cast_type, bool internal, std::optional diagnostic) + { + const auto & settings_ref = context->getSettingsRef(); + + if (internal) + return std::make_unique(context, cast_type, internal, diagnostic, false /*keep_nullable*/, DataTypeValidationSettings{}); + else + return std::make_unique(context, cast_type, internal, diagnostic, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); + } + +protected: + FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override + { + return createFunctionBaseCast(context, arguments, return_type, diagnostic, cast_type); + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + const auto & column = arguments.back().column; + if (!column) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. " + "Instead there is non-constant column of type {}", getName(), arguments.back().type->getName()); + + const auto * type_col = checkAndGetColumnConst(column.get()); + if (!type_col) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. 
" + "Instead there is a column with the following structure: {}", getName(), column->dumpStructure()); + + DataTypePtr type = DataTypeFactory::instance().get(type_col->getValue()); + validateDataType(type, data_type_validation_settings); + + if (cast_type == CastType::accurateOrNull) + return makeNullable(type); + + if (internal) + return type; + + if (keep_nullable && arguments.front().type->isNullable() && type->canBeInsideNullable()) + return makeNullable(type); + + return type; + } + + bool useDefaultImplementationForNulls() const override { return false; } + bool useDefaultImplementationForNothing() const override { return false; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + +private: + ContextPtr context; + CastType cast_type; + bool internal; + std::optional diagnostic; + bool keep_nullable; + DataTypeValidationSettings data_type_validation_settings; +}; + + UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) { const auto * arg_type = named_column.type.get(); diff --git a/src/Functions/CastOverloadResolverImpl.h b/src/Functions/CastOverloadResolverImpl.h deleted file mode 100644 index 2bb83040163..00000000000 --- a/src/Functions/CastOverloadResolverImpl.h +++ /dev/null @@ -1,113 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int ILLEGAL_TYPE_OF_ARGUMENT; -} - - -/** CastInternal does not preserve nullability of the data type, - * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1). - * - * Cast preserves nullability according to setting `cast_keep_nullable`, - * i.e. Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1. 
- */ -class CastOverloadResolverImpl : public IFunctionOverloadResolver -{ -public: - String getName() const override - { - if (cast_type == CastType::accurate) - return "accurateCast"; - if (cast_type == CastType::accurateOrNull) - return "accurateCastOrNull"; - if (internal) - return "_CAST"; - else - return "CAST"; - } - - size_t getNumberOfArguments() const override { return 2; } - - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - - explicit CastOverloadResolverImpl(ContextPtr context_, CastType cast_type_, bool internal_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_) - : context(context_) - , cast_type(cast_type_) - , internal(internal_) - , diagnostic(std::move(diagnostic_)) - , keep_nullable(keep_nullable_) - , data_type_validation_settings(data_type_validation_settings_) - { - } - - static FunctionOverloadResolverPtr create(ContextPtr context, CastType cast_type, bool internal, std::optional diagnostic) - { - const auto & settings_ref = context->getSettingsRef(); - - if (internal) - return std::make_unique(context, cast_type, internal, diagnostic, false /*keep_nullable*/, DataTypeValidationSettings{}); - else - return std::make_unique(context, cast_type, internal, diagnostic, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); - } - -protected: - FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override - { - return createFunctionBaseCast(context, arguments, return_type, diagnostic, cast_type); - } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - const auto & column = arguments.back().column; - if (!column) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. " - "Instead there is non-constant column of type {}", getName(), arguments.back().type->getName()); - - const auto * type_col = checkAndGetColumnConst(column.get()); - if (!type_col) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. 
" - "Instead there is a column with the following structure: {}", getName(), column->dumpStructure()); - - DataTypePtr type = DataTypeFactory::instance().get(type_col->getValue()); - validateDataType(type, data_type_validation_settings); - - if (cast_type == CastType::accurateOrNull) - return makeNullable(type); - - if (internal) - return type; - - if (keep_nullable && arguments.front().type->isNullable() && type->canBeInsideNullable()) - return makeNullable(type); - - return type; - } - - bool useDefaultImplementationForNulls() const override { return false; } - bool useDefaultImplementationForNothing() const override { return false; } - bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } - -private: - ContextPtr context; - CastType cast_type; - bool internal; - std::optional diagnostic; - bool keep_nullable; - DataTypeValidationSettings data_type_validation_settings; -}; - -} From 061cee257c489eccdcbcda7e694937fa93444d18 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 08:10:59 +0100 Subject: [PATCH 0175/1081] Remove crap --- src/Functions/CastOverloadResolver.cpp | 9 +++- src/Functions/FunctionsConversion.cpp | 59 +++++++++++++++++--- src/Functions/FunctionsConversion.h | 75 ++++++-------------------- src/Functions/concat.cpp | 2 +- 4 files changed, 76 insertions(+), 69 deletions(-) diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 98debc3e2a4..4a081d684f6 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -1,12 +1,12 @@ #include #include -#include #include #include #include #include #include #include +#include namespace DB @@ -17,6 +17,13 @@ namespace ErrorCodes extern const int ILLEGAL_TYPE_OF_ARGUMENT; } +FunctionBasePtr createFunctionBaseCast( + ContextPtr context + , const ColumnsWithTypeAndName & arguments + , const DataTypePtr & return_type + , std::optional diagnostic + , CastType cast_type); + /** CastInternal does not preserve nullability of the data type, * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1). 
diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index ce9e4fd66e1..ebb63f1b25d 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -5,13 +5,6 @@ namespace DB { -namespace -{ - -struct CastInternalName { static constexpr auto name = "_CAST"; }; - -} - FunctionBasePtr createFunctionBaseCast( ContextPtr context , const ColumnsWithTypeAndName & arguments @@ -25,7 +18,7 @@ FunctionBasePtr createFunctionBaseCast( data_types[i] = arguments[i].type; auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); - return std::make_unique>(context, CastInternalName::name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); + return std::make_unique(context, "CAST", std::move(monotonicity), data_types, return_type, diagnostic, cast_type); } REGISTER_FUNCTION(Conversion) @@ -153,4 +146,54 @@ REGISTER_FUNCTION(Conversion) factory.registerFunction>(); } + +MonotonicityHelper::MonotonicityForRange MonotonicityHelper::getMonotonicityInformation(const DataTypePtr & from_type, const IDataType * to_type) +{ + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (isEnum(from_type)) + { + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + } + /// other types like Null, FixedString, Array and Tuple have no monotonicity defined + return {}; +} + } diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 689b72dc917..0c4b7b41b93 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -3170,13 +3170,17 @@ private: }; +struct FunctionCastName +{ + static constexpr auto name = "CAST"; +}; + class FunctionCastBase : public IFunctionBase { public: using MonotonicityForRange = 
std::function; }; -template class FunctionCast final : public FunctionCastBase { public: @@ -3204,7 +3208,7 @@ public: try { return std::make_unique( - prepareUnpackDictionaries(getArgumentTypes()[0], getResultType()), cast_name, diagnostic); + prepareUnpackDictionaries(getArgumentTypes()[0], getResultType()), cast_name, diagnostic); } catch (Exception & e) { @@ -3278,7 +3282,7 @@ private: { /// In case when converting to Nullable type, we apply different parsing rule, /// that will not throw an exception but return NULL in case of malformed input. - FunctionPtr function = FunctionConvertFromString::create(); + FunctionPtr function = FunctionConvertFromString::create(); return createFunctionAdaptor(function, from_type); } else if (!can_apply_accurate_cast) @@ -3304,7 +3308,7 @@ private: { #define GENERATE_OVERFLOW_MODE_CASE(OVERFLOW_MODE, ADDITIONS) \ case FormatSettings::DateTimeOverflowBehavior::OVERFLOW_MODE: \ - result_column = ConvertImpl::execute( \ + result_column = ConvertImpl::execute( \ arguments, result_type, input_rows_count, ADDITIONS()); \ break; if (wrapper_cast_type == CastType::accurate) @@ -3334,7 +3338,7 @@ private: { #define GENERATE_OVERFLOW_MODE_CASE(OVERFLOW_MODE, ADDITIONS) \ case FormatSettings::DateTimeOverflowBehavior::OVERFLOW_MODE: \ - result_column = ConvertImpl::template execute( \ + result_column = ConvertImpl::template execute( \ arguments, result_type, input_rows_count); \ break; if (wrapper_cast_type == CastType::accurate) @@ -3368,7 +3372,7 @@ arguments, result_type, input_rows_count); \ { if (wrapper_cast_type == CastType::accurateOrNull) { - auto nullable_column_wrapper = FunctionCast::createToNullableColumnWrapper(); + auto nullable_column_wrapper = FunctionCast::createToNullableColumnWrapper(); return nullable_column_wrapper(arguments, result_type, column_nullable, input_rows_count); } else @@ -3494,7 +3498,7 @@ arguments, result_type, input_rows_count); \ { AccurateConvertStrategyAdditions additions; additions.scale = scale; - result_column = ConvertImpl::execute( + result_column = ConvertImpl::execute( arguments, result_type, input_rows_count, additions); return true; @@ -3503,7 +3507,7 @@ arguments, result_type, input_rows_count); \ { AccurateOrNullConvertStrategyAdditions additions; additions.scale = scale; - result_column = ConvertImpl::execute( + result_column = ConvertImpl::execute( arguments, result_type, input_rows_count, additions); return true; @@ -3516,14 +3520,14 @@ arguments, result_type, input_rows_count); \ /// Consistent with CAST(Nullable(String) AS Nullable(Numbers)) /// In case when converting to Nullable type, we apply different parsing rule, /// that will not throw an exception but return NULL in case of malformed input. 
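The comment above states the accurateCastOrNull-style contract: malformed input produces NULL instead of an exception. A hedged sketch of the same contract expressed with std::optional over std::from_chars (simplified, no ClickHouse types):

    #include <charconv>
    #include <optional>
    #include <string_view>

    /// Returns the parsed value, or nullopt when the input is not a number
    /// or is not fully consumed: NULL instead of an exception.
    std::optional<int> toInt32OrNull(std::string_view s)
    {
        int value = 0;
        auto [ptr, ec] = std::from_chars(s.data(), s.data() + s.size(), value);
        if (ec != std::errc() || ptr != s.data() + s.size())
            return std::nullopt;
        return value;
    }

    /// toInt32OrNull("42") -> 42; toInt32OrNull("42x") and toInt32OrNull("") -> nullopt.
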
- result_column = ConvertImpl::execute( + result_column = ConvertImpl::execute( arguments, result_type, input_rows_count, scale); return true; } } - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); + result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); return true; }); @@ -3533,7 +3537,7 @@ arguments, result_type, input_rows_count); \ { if (wrapper_cast_type == CastType::accurateOrNull) { - auto nullable_column_wrapper = FunctionCast::createToNullableColumnWrapper(); + auto nullable_column_wrapper = FunctionCast::createToNullableColumnWrapper(); return nullable_column_wrapper(arguments, result_type, column_nullable, input_rows_count); } else @@ -4922,54 +4926,7 @@ public: return FunctionTo::Type::Monotonic::get; } - static MonotonicityForRange getMonotonicityInformation(const DataTypePtr & from_type, const IDataType * to_type) - { - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (isEnum(from_type)) - { - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - } - /// other types like Null, FixedString, Array and Tuple have no monotonicity defined - return {}; - } + static MonotonicityForRange getMonotonicityInformation(const DataTypePtr & from_type, const IDataType * to_type); }; FunctionBasePtr createFunctionBaseCast( diff --git a/src/Functions/concat.cpp b/src/Functions/concat.cpp index d68f5256f6d..c75a806559c 100644 --- a/src/Functions/concat.cpp +++ b/src/Functions/concat.cpp @@ -1,8 +1,8 @@ #include +#include #include #include #include -#include #include #include #include From 804c07156d90c0b4ce6b632f30b5481077d0f5d3 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 08:24:53 +0100 Subject: [PATCH 0176/1081] Fix garbage --- .../Serializations/SerializationObject.cpp | 20 +++- 
src/Functions/FunctionsConversion.h | 97 ++++++++----------- 2 files changed, 57 insertions(+), 60 deletions(-) diff --git a/src/DataTypes/Serializations/SerializationObject.cpp b/src/DataTypes/Serializations/SerializationObject.cpp index e6dc16ef5a0..cd186ec2c46 100644 --- a/src/DataTypes/Serializations/SerializationObject.cpp +++ b/src/DataTypes/Serializations/SerializationObject.cpp @@ -5,13 +5,11 @@ #include #include #include -#include #include #include -#include #include +#include #include -#include #include #include @@ -30,6 +28,7 @@ namespace ErrorCodes extern const int CANNOT_READ_ALL_DATA; extern const int ARGUMENT_OUT_OF_BOUND; extern const int LOGICAL_ERROR; + extern const int CANNOT_PARSE_TEXT; } template @@ -344,7 +343,20 @@ void SerializationObject::deserializeBinaryBulkFromString( state.nested_serialization->deserializeBinaryBulkWithMultipleStreams( column_string, limit, settings, state.nested_state, cache); - ConvertImplGenericFromString::executeImpl(*column_string, column_object, *this, column_string->size()); + size_t input_rows_count = column_string->size(); + column_object.reserve(input_rows_count); + + FormatSettings format_settings; + for (size_t i = 0; i < input_rows_count; ++i) + { + const auto & val = column_string->getDataAt(i); + ReadBufferFromMemory read_buffer(val.data, val.size); + deserializeWholeText(column_object, read_buffer, format_settings); + + if (!read_buffer.eof()) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, + "Cannot parse string to column Object. Expected eof"); + } } template diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 0c4b7b41b93..c21e85fb40e 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -1818,14 +1818,10 @@ struct ConvertImpl {}; /// Generic conversion of any type from String. Used for complex types: Array and Tuple or types with custom serialization. 
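The generic from-String path parses each row through the target type's text serialization and requires the whole value to be consumed. A simplified sketch of that per-row loop, with plain std::string rows standing in for a ColumnString:

    #include <charconv>
    #include <stdexcept>
    #include <string>
    #include <vector>

    /// Parse every row, demanding that each string is consumed to the end,
    /// like the "expected eof" check after deserializeWholeText above.
    std::vector<double> parseColumn(const std::vector<std::string> & rows)
    {
        std::vector<double> out;
        out.reserve(rows.size());
        for (const auto & row : rows)
        {
            double v = 0;
            auto [ptr, ec] = std::from_chars(row.data(), row.data() + row.size(), v);
            if (ec != std::errc() || ptr != row.data() + row.size())
                throw std::runtime_error("Cannot parse string '" + row + "': expected eof");
            out.push_back(v);
        }
        return out;
    }
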
-template struct ConvertImplGenericFromString { static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) { - static_assert(std::is_same_v || std::is_same_v, - "Can be used only to parse from ColumnString or ColumnFixedString"); - const IColumn & column_from = *arguments[0].column; const IDataType & data_type_to = *result_type; auto res = data_type_to.createColumn(); @@ -1841,63 +1837,52 @@ struct ConvertImplGenericFromString IColumn & column_to, const ISerialization & serialization_from, size_t input_rows_count, - const PaddedPODArray * null_map = nullptr, - const IDataType * result_type = nullptr) + const PaddedPODArray * null_map, + const IDataType * result_type) { - static_assert(std::is_same_v || std::is_same_v, - "Can be used only to parse from ColumnString or ColumnFixedString"); + column_to.reserve(input_rows_count); - if (const StringColumnType * col_from_string = checkAndGetColumn(&column_from)) + FormatSettings format_settings; + for (size_t i = 0; i < input_rows_count; ++i) { - column_to.reserve(input_rows_count); - - FormatSettings format_settings; - for (size_t i = 0; i < input_rows_count; ++i) + if (null_map && (*null_map)[i]) { - if (null_map && (*null_map)[i]) + column_to.insertDefault(); + continue; + } + + const auto & val = column_from.getDataAt(i); + ReadBufferFromMemory read_buffer(val.data, val.size); + try + { + serialization_from.deserializeWholeText(column_to, read_buffer, format_settings); + } + catch (const Exception & e) + { + auto * nullable_column = typeid_cast(&column_to); + if (e.code() == ErrorCodes::CANNOT_PARSE_BOOL && nullable_column) { - column_to.insertDefault(); + auto & col_nullmap = nullable_column->getNullMapData(); + if (col_nullmap.size() != nullable_column->size()) + col_nullmap.resize_fill(nullable_column->size()); + if (nullable_column->size() == (i + 1)) + nullable_column->popBack(1); + nullable_column->insertDefault(); continue; } + throw; + } - const auto & val = col_from_string->getDataAt(i); - ReadBufferFromMemory read_buffer(val.data, val.size); - try - { - serialization_from.deserializeWholeText(column_to, read_buffer, format_settings); - } - catch (const Exception & e) - { - auto * nullable_column = typeid_cast(&column_to); - if (e.code() == ErrorCodes::CANNOT_PARSE_BOOL && nullable_column) - { - auto & col_nullmap = nullable_column->getNullMapData(); - if (col_nullmap.size() != nullable_column->size()) - col_nullmap.resize_fill(nullable_column->size()); - if (nullable_column->size() == (i + 1)) - nullable_column->popBack(1); - nullable_column->insertDefault(); - continue; - } - throw; - } - - if (!read_buffer.eof()) - { - if (result_type) - throwExceptionForIncompletelyParsedValue(read_buffer, *result_type); - else - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, - "Cannot parse string to column {}. Expected eof", column_to.getName()); - } + if (!read_buffer.eof()) + { + if (result_type) + throwExceptionForIncompletelyParsedValue(read_buffer, *result_type); + else + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, + "Cannot parse string to column {}. 
Expected eof", column_to.getName()); } } - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, - "Illegal column {} of first argument of conversion function from string", - column_from.getName()); } - }; @@ -3392,7 +3377,7 @@ arguments, result_type, input_rows_count); \ { if (checkAndGetDataType(from_type.get())) { - return &ConvertImplGenericFromString::execute; + return &ConvertImplGenericFromString::execute; } return createWrapper(from_type, to_type, requested_result_is_nullable); @@ -3555,7 +3540,7 @@ arguments, result_type, input_rows_count); \ /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return &ConvertImplGenericFromString::execute; } else if (const auto * agg_type = checkAndGetDataType(from_type_untyped.get())) { @@ -3598,7 +3583,7 @@ arguments, result_type, input_rows_count); \ /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return &ConvertImplGenericFromString::execute; } DataTypePtr from_type_holder; @@ -3689,7 +3674,7 @@ arguments, result_type, input_rows_count); \ /// Conversion from String through parsing. if (checkAndGetDataType(from_type_untyped.get())) { - return &ConvertImplGenericFromString::execute; + return &ConvertImplGenericFromString::execute; } const auto * from_type = checkAndGetDataType(from_type_untyped.get()); @@ -4034,7 +4019,7 @@ arguments, result_type, input_rows_count); \ { return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) { - auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count)->assumeMutable(); + auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count)->assumeMutable(); res->finalize(); return res; }; @@ -4831,7 +4816,7 @@ arguments, result_type, input_rows_count); \ auto wrapped_result_type = result_type; if (requested_result_is_nullable) wrapped_result_type = makeNullable(result_type); - return ConvertImplGenericFromString::execute( + return ConvertImplGenericFromString::execute( arguments, wrapped_result_type, column_nullable, input_rows_count); }; return true; From f2c4a5bb94ea319b38caf35ac90a9b9208cdc745 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 08:30:13 +0100 Subject: [PATCH 0177/1081] Prevent garbage from appearing --- src/Functions/FunctionsConversion.cpp | 5008 ++++++++++++++++++++++++- src/Functions/FunctionsConversion.h | 4924 ------------------------ 2 files changed, 4957 insertions(+), 4975 deletions(-) delete mode 100644 src/Functions/FunctionsConversion.h diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index ebb63f1b25d..865f7db8e12 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -1,10 +1,4966 @@ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include #include -#include +#include +#include +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include namespace DB { +namespace ErrorCodes +{ + extern const int ATTEMPT_TO_READ_AFTER_EOF; + extern const int CANNOT_PARSE_NUMBER; + extern const int CANNOT_READ_ARRAY_FROM_TEXT; + extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED; + extern const int CANNOT_PARSE_QUOTED_STRING; + extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; + extern const int CANNOT_PARSE_DATE; + extern const int CANNOT_PARSE_DATETIME; + extern const int CANNOT_PARSE_TEXT; + extern const int CANNOT_PARSE_UUID; + extern const int CANNOT_PARSE_IPV4; + extern const int CANNOT_PARSE_IPV6; + extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; + extern const int LOGICAL_ERROR; + extern const int TYPE_MISMATCH; + extern const int CANNOT_CONVERT_TYPE; + extern const int ILLEGAL_COLUMN; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int NOT_IMPLEMENTED; + extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN; + extern const int CANNOT_PARSE_BOOL; + extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE; +} + +/** Type conversion functions. + * toType - conversion in "natural way"; + */ + +UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column); + +/// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type. +struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; }; + +struct AccurateConvertStrategyAdditions +{ + UInt32 scale { 0 }; +}; + +struct AccurateOrNullConvertStrategyAdditions +{ + UInt32 scale { 0 }; +}; + + +struct ConvertDefaultBehaviorTag {}; +struct ConvertReturnNullOnErrorTag {}; +struct ConvertReturnZeroOnErrorTag {}; + +/** Conversion of number types to each other, enums to numbers, dates and datetimes to numbers and back: done by straight assignment. 
+ * (Date is represented internally as number of days from some day; DateTime - as unix timestamp) + */ +template +struct ConvertImpl +{ + using FromFieldType = typename FromDataType::FieldType; + using ToFieldType = typename ToDataType::FieldType; + + template + static ColumnPtr NO_SANITIZE_UNDEFINED execute( + const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type [[maybe_unused]], size_t input_rows_count, + Additions additions [[maybe_unused]] = Additions()) + { + const ColumnWithTypeAndName & named_from = arguments[0]; + + using ColVecFrom = typename FromDataType::ColumnType; + using ColVecTo = typename ToDataType::ColumnType; + + if constexpr ((IsDataTypeDecimal || IsDataTypeDecimal) + && !(std::is_same_v || std::is_same_v)) + { + if constexpr (!IsDataTypeDecimalOrNumber || !IsDataTypeDecimalOrNumber) + { + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + named_from.column->getName(), Name::name); + } + } + + if (const ColVecFrom * col_from = checkAndGetColumn(named_from.column.get())) + { + typename ColVecTo::MutablePtr col_to = nullptr; + + if constexpr (IsDataTypeDecimal) + { + UInt32 scale; + + if constexpr (std::is_same_v + || std::is_same_v) + { + scale = additions.scale; + } + else + { + scale = additions; + } + + col_to = ColVecTo::create(0, scale); + } + else + col_to = ColVecTo::create(); + + const auto & vec_from = col_from->getData(); + auto & vec_to = col_to->getData(); + vec_to.resize(input_rows_count); + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to [[maybe_unused]] = nullptr; + if constexpr (std::is_same_v) + { + col_null_map_to = ColumnUInt8::create(input_rows_count, false); + vec_null_map_to = &col_null_map_to->getData(); + } + + bool result_is_bool = isBool(result_type); + for (size_t i = 0; i < input_rows_count; ++i) + { + if constexpr (std::is_same_v) + { + if (result_is_bool) + { + vec_to[i] = vec_from[i] != FromFieldType(0); + continue; + } + } + + if constexpr (std::is_same_v && std::is_same_v) + { + static_assert( + std::is_same_v, + "UInt128 and UUID types must be same"); + + vec_to[i].items[1] = vec_from[i].toUnderType().items[0]; + vec_to[i].items[0] = vec_from[i].toUnderType().items[1]; + + continue; + } + + if constexpr (std::is_same_v && std::is_same_v) + { + static_assert( + std::is_same_v, + "UInt128 and IPv6 types must be same"); + + vec_to[i].items[1] = std::byteswap(vec_from[i].toUnderType().items[0]); + vec_to[i].items[0] = std::byteswap(vec_from[i].toUnderType().items[1]); + + continue; + } + + if constexpr (std::is_same_v != std::is_same_v) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Conversion between numeric types and UUID is not supported. " + "Probably the passed UUID is unquoted"); + } + else if constexpr ( + (std::is_same_v != std::is_same_v) + && !(is_any_of || is_any_of) + ) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Conversion from {} to {} is not supported", + TypeName, TypeName); + } + else if constexpr (std::is_same_v != std::is_same_v && !(std::is_same_v || std::is_same_v)) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Conversion between numeric types and IPv6 is not supported. 
" + "Probably the passed IPv6 is unquoted"); + } + else + { + if constexpr (IsDataTypeDecimal || IsDataTypeDecimal) + { + if constexpr (std::is_same_v) + { + ToFieldType result; + bool convert_result = false; + + if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) + convert_result = tryConvertDecimals(vec_from[i], col_from->getScale(), col_to->getScale(), result); + else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) + convert_result = tryConvertFromDecimal(vec_from[i], col_from->getScale(), result); + else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) + convert_result = tryConvertToDecimal(vec_from[i], col_to->getScale(), result); + + if (convert_result) + vec_to[i] = result; + else + { + vec_to[i] = static_cast(0); + (*vec_null_map_to)[i] = true; + } + } + else + { + if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) + vec_to[i] = convertDecimals(vec_from[i], col_from->getScale(), col_to->getScale()); + else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) + vec_to[i] = convertFromDecimal(vec_from[i], col_from->getScale()); + else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) + vec_to[i] = convertToDecimal(vec_from[i], col_to->getScale()); + else + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Unsupported data type in conversion function"); + } + } + else + { + /// If From Data is Nan or Inf and we convert to integer type, throw exception + if constexpr (std::is_floating_point_v && !std::is_floating_point_v) + { + if (!isFinite(vec_from[i])) + { + if constexpr (std::is_same_v) + { + vec_to[i] = 0; + (*vec_null_map_to)[i] = true; + continue; + } + else + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Unexpected inf or nan to integer conversion"); + } + } + + if constexpr (std::is_same_v + || std::is_same_v) + { + bool convert_result = accurate::convertNumeric(vec_from[i], vec_to[i]); + + if (!convert_result) + { + if (std::is_same_v) + { + vec_to[i] = 0; + (*vec_null_map_to)[i] = true; + } + else + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", + named_from.column->getName(), result_type->getName()); + } + } + } + else + { + if constexpr (std::is_same_v && std::is_same_v) + { + const uint8_t ip4_cidr[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; + const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); + if (!matchIPv6Subnet(src, ip4_cidr, 96)) + { + char addr[IPV6_MAX_TEXT_LENGTH + 1] {}; + char * paddr = addr; + formatIPv6(src, paddr); + + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "IPv6 {} in column {} is not in IPv4 mapping block", addr, named_from.column->getName()); + } + + uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); + if constexpr (std::endian::native == std::endian::little) + { + dst[0] = src[15]; + dst[1] = src[14]; + dst[2] = src[13]; + dst[3] = src[12]; + } + else + { + dst[0] = src[12]; + dst[1] = src[13]; + dst[2] = src[14]; + dst[3] = src[15]; + } + } + else if constexpr (std::is_same_v && std::is_same_v) + { + const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); + uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); + std::memset(dst, '\0', IPV6_BINARY_LENGTH); + dst[10] = dst[11] = 0xff; + + if constexpr (std::endian::native == std::endian::little) + { + dst[12] = src[3]; + dst[13] = src[2]; + dst[14] = src[1]; + dst[15] = src[0]; + } + else + { + dst[12] = src[0]; + dst[13] = src[1]; + dst[14] = src[2]; + dst[15] = src[3]; + } + } + else if constexpr 
(std::is_same_v && std::is_same_v) + vec_to[i] = static_cast(static_cast(vec_from[i])); + else if constexpr (std::is_same_v && (std::is_same_v || std::is_same_v)) + vec_to[i] = static_cast(vec_from[i] * DATE_SECONDS_PER_DAY); + else + vec_to[i] = static_cast(vec_from[i]); + } + } + } + } + + if constexpr (std::is_same_v) + return ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); + else + return col_to; + } + else + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + named_from.column->getName(), Name::name); + } +}; + +/** Conversion of DateTime to Date: throw off time component. + */ +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +/** Conversion of DateTime to Date32: throw off time component. + */ +template +struct ConvertImpl + : DateTimeTransformImpl {}; + +/** Conversion of Date to DateTime: adding 00:00:00 time component. + */ +template +struct ToDateTimeImpl +{ + static constexpr auto name = "toDateTime"; + + static UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (d > MAX_DATETIME_DAY_NUM) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Day number {} is out of bounds of type DateTime", d); + } + else if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) + { + if (d > MAX_DATETIME_DAY_NUM) + d = MAX_DATETIME_DAY_NUM; + } + return static_cast(time_zone.fromDayNum(DayNum(d))); + } + + static UInt32 execute(Int32 d, const DateLUTImpl & time_zone) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) + { + if (d < 0) + return 0; + else if (d > MAX_DATETIME_DAY_NUM) + d = MAX_DATETIME_DAY_NUM; + } + else if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (d < 0 || d > MAX_DATETIME_DAY_NUM) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type DateTime", d); + } + return static_cast(time_zone.fromDayNum(ExtendedDayNum(d))); + } + + static UInt32 execute(UInt32 dt, const DateLUTImpl & /*time_zone*/) + { + return dt; + } + + static UInt32 execute(Int64 dt64, const DateLUTImpl & /*time_zone*/) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Ignore) + return static_cast(dt64); + else + { + if (dt64 < 0 || dt64 >= MAX_DATETIME_TIMESTAMP) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) + return dt64 < 0 ? 0 : std::numeric_limits::max(); + else + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type DateTime", dt64); + } + else + return static_cast(dt64); + } + } +}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +/// Implementation of toDate function. 
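Before the toDate transforms that follow: the execute overloads above choose between Throw, Saturate and Ignore when a value leaves the target range. A compact sketch of that three-way policy on a toy range (bounds are placeholders, not the real Date/DateTime limits):

    #include <cstdint>
    #include <stdexcept>

    enum class OverflowBehavior { Throw, Saturate, Ignore };

    /// Convert a wide value into [0, max] under one of three policies, the
    /// way the DateTime transforms treat out-of-range timestamps.
    template <OverflowBehavior behavior>
    uint32_t toBounded(int64_t v, int64_t max)
    {
        if (v < 0 || v > max)
        {
            if constexpr (behavior == OverflowBehavior::Throw)
                throw std::out_of_range("value is out of bounds");
            else if constexpr (behavior == OverflowBehavior::Saturate)
                return v < 0 ? 0 : static_cast<uint32_t>(max);
            /// Ignore: fall through and truncate, accepting wraparound.
        }
        return static_cast<uint32_t>(v);
    }
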
+ +template +struct ToDateTransform32Or64 +{ + static constexpr auto name = "toDate"; + + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (from > MAX_DATETIME_TIMESTAMP) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type Date", from); + } + /// if value is smaller (or equal) than maximum day value for Date, than treat it as day num, + /// otherwise treat it as unix timestamp. This is a bit weird, but we leave this behavior. + if (from <= DATE_LUT_MAX_DAY_NUM) + return from; + else + return time_zone.toDayNum(std::min(time_t(from), time_t(MAX_DATETIME_TIMESTAMP))); + } +}; + +/** Conversion of Date32 to Date. + */ +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ToDateTransform32Or64Signed +{ + static constexpr auto name = "toDate"; + + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) + { + // TODO: decide narrow or extended range based on FromType + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (from < 0 || from > MAX_DATE_TIMESTAMP) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type Date", from); + } + else + { + if (from < 0) + return 0; + } + return (from <= DATE_LUT_MAX_DAY_NUM) + ? static_cast(from) + : time_zone.toDayNum(std::min(time_t(from), time_t(MAX_DATE_TIMESTAMP))); + } +}; + +template +struct ToDateTransform8Or16Signed +{ + static constexpr auto name = "toDate"; + + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) + { + if (from < 0) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type Date", from); + else + return 0; + } + return from; + } +}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +/// Implementation of toDate32 function. 
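+/// Date32 stores a signed day number relative to 1970-01-01 and covers an extended range
+/// (roughly the years 1900..2299), so negative inputs address dates before the epoch.
+/// A hedged illustration (the exact bounds come from the DateLUT, not from this comment):
+///
+///     toDate32(0)      -- 1970-01-01
+///     toDate32(-25567) -- approximately 1900-01-01, the minimum; smaller inputs are clamped to it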
+
+template <typename FromType, typename ToType, FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior>
+struct ToDate32Transform32Or64
+{
+    static constexpr auto name = "toDate32";
+
+    static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone)
+    {
+        if (from < DATE_LUT_MAX_EXTEND_DAY_NUM)
+            return static_cast<ToType>(from);
+        else
+        {
+            if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw)
+            {
+                if (from > MAX_DATETIME64_TIMESTAMP) [[unlikely]]
+                    throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type Date32", from);
+            }
+            return time_zone.toDayNum(std::min(time_t(from), time_t(MAX_DATETIME64_TIMESTAMP)));
+        }
+    }
+};
+
+template <typename FromType, typename ToType, FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior>
+struct ToDate32Transform32Or64Signed
+{
+    static constexpr auto name = "toDate32";
+
+    static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone)
+    {
+        static const Int32 daynum_min_offset = -static_cast<Int32>(time_zone.getDayNumOffsetEpoch());
+
+        if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw)
+        {
+            if (from < daynum_min_offset || from > MAX_DATETIME64_TIMESTAMP) [[unlikely]]
+                throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type Date32", from);
+        }
+
+        if (from < daynum_min_offset)
+            return daynum_min_offset;
+
+        return (from < DATE_LUT_MAX_EXTEND_DAY_NUM)
+            ? static_cast<ToType>(from)
+            : time_zone.toDayNum(std::min(time_t(Int64(from)), time_t(MAX_DATETIME64_TIMESTAMP)));
+    }
+};
+
+template <typename FromType, typename ToType>
+struct ToDate32Transform8Or16Signed
+{
+    static constexpr auto name = "toDate32";
+
+    static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &)
+    {
+        return from;
+    }
+};
+
+/** Special case of converting Int8, Int16, (U)Int32 or (U)Int64 (and also, for convenience,
+  * Float32, Float64) to Date. If the number is less than 65536, it is treated as a DayNum;
+  * if it is greater than or equal to 65536, it is treated as a unix timestamp.
+  * If the number exceeds UInt32, it is saturated to MAX_UINT32 and then converted as a DayNum.
+  * It's a bit illogical, as we actually have two functions in one.
+  * But it allows supporting the frequent case when a user writes toDate(UInt32),
+  * expecting conversion of a unix timestamp to a Date
+  * (otherwise such usage would be a frequent mistake).
+ */ +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + + +template +struct ToDateTimeTransform64 +{ + static constexpr auto name = "toDateTime"; + + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (from > MAX_DATETIME_TIMESTAMP) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime", from); + } + return static_cast(std::min(time_t(from), time_t(MAX_DATETIME_TIMESTAMP))); + } +}; + +template +struct ToDateTimeTransformSigned +{ + static constexpr auto name = "toDateTime"; + + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) + { + if (from < 0) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime", from); + else + return 0; + } + return from; + } +}; + +template +struct ToDateTimeTransform64Signed +{ + static constexpr auto name = "toDateTime"; + + static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (from < 0 || from > MAX_DATETIME_TIMESTAMP) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime", from); + } + + if (from < 0) + return 0; + return static_cast(std::min(time_t(from), time_t(MAX_DATETIME_TIMESTAMP))); + } +}; + +/// Special case of converting Int8, Int16, Int32 or (U)Int64 (and also, for convenience, Float32, Float64) to DateTime. 
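+/// A hedged illustration of the behavior under the non-throwing overflow modes
+/// (0xFFFFFFFF = 4294967295 is the largest timestamp representable by DateTime):
+///
+///     toDateTime(-1)          -- negative input, clamped to 0 (1970-01-01 00:00:00 UTC)
+///     toDateTime(99999999999) -- above the DateTime range, clamped to 2106-02-07 06:28:15 UTC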
+template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +/** Conversion of numeric to DateTime64 + */ + +template +struct ToDateTime64TransformUnsigned +{ + static constexpr auto name = "toDateTime64"; + + const DateTime64::NativeType scale_multiplier = 1; + + ToDateTime64TransformUnsigned(UInt32 scale = 0) /// NOLINT + : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) + {} + + NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(FromType from, const DateLUTImpl &) const + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (from > MAX_DATETIME64_TIMESTAMP) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime64", from); + else + return DecimalUtils::decimalFromComponentsWithMultiplier(from, 0, scale_multiplier); + } + else + return DecimalUtils::decimalFromComponentsWithMultiplier(std::min(from, MAX_DATETIME64_TIMESTAMP), 0, scale_multiplier); + } +}; +template +struct ToDateTime64TransformSigned +{ + static constexpr auto name = "toDateTime64"; + + const DateTime64::NativeType scale_multiplier = 1; + + ToDateTime64TransformSigned(UInt32 scale = 0) /// NOLINT + : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) + {} + + NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(FromType from, const DateLUTImpl &) const + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (from < MIN_DATETIME64_TIMESTAMP || from > MAX_DATETIME64_TIMESTAMP) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime64", from); + } + from = static_cast(std::max(from, MIN_DATETIME64_TIMESTAMP)); + from = static_cast(std::min(from, MAX_DATETIME64_TIMESTAMP)); + + return DecimalUtils::decimalFromComponentsWithMultiplier(from, 0, scale_multiplier); + } +}; +template +struct ToDateTime64TransformFloat +{ + static constexpr auto name = "toDateTime64"; + + const UInt32 scale = 1; + + ToDateTime64TransformFloat(UInt32 scale_ = 0) /// NOLINT + : scale(scale_) + {} + + NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(FromType from, const DateLUTImpl &) const + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (from < MIN_DATETIME64_TIMESTAMP || from > MAX_DATETIME64_TIMESTAMP) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime64", from); + } + + from = std::max(from, static_cast(MIN_DATETIME64_TIMESTAMP)); + from = std::min(from, static_cast(MAX_DATETIME64_TIMESTAMP)); + return convertToDecimal(from, scale); + } +}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct 
ConvertImpl + : DateTimeTransformImpl, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl, false> {}; + + +/** Conversion of DateTime64 to Date or DateTime: discards fractional part. + */ +template +struct FromDateTime64Transform +{ + static constexpr auto name = Transform::name; + + const DateTime64::NativeType scale_multiplier = 1; + + FromDateTime64Transform(UInt32 scale) /// NOLINT + : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) + {} + + auto execute(DateTime64::NativeType dt, const DateLUTImpl & time_zone) const + { + const auto c = DecimalUtils::splitWithScaleMultiplier(DateTime64(dt), scale_multiplier); + return Transform::execute(static_cast(c.whole), time_zone); + } +}; + +/** Conversion of DateTime64 to Date or DateTime: discards fractional part. + */ +template +struct ConvertImpl + : DateTimeTransformImpl>, false> {}; + +template +struct ConvertImpl + : DateTimeTransformImpl>, false> {}; + +struct ToDateTime64Transform +{ + static constexpr auto name = "toDateTime64"; + + const DateTime64::NativeType scale_multiplier = 1; + + ToDateTime64Transform(UInt32 scale = 0) /// NOLINT + : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) + {} + + DateTime64::NativeType execute(UInt16 d, const DateLUTImpl & time_zone) const + { + const auto dt = ToDateTimeImpl<>::execute(d, time_zone); + return execute(dt, time_zone); + } + + DateTime64::NativeType execute(Int32 d, const DateLUTImpl & time_zone) const + { + Int64 dt = static_cast(time_zone.fromDayNum(ExtendedDayNum(d))); + return DecimalUtils::decimalFromComponentsWithMultiplier(dt, 0, scale_multiplier); + } + + DateTime64::NativeType execute(UInt32 dt, const DateLUTImpl & /*time_zone*/) const + { + return DecimalUtils::decimalFromComponentsWithMultiplier(dt, 0, scale_multiplier); + } +}; + +/** Conversion of Date or DateTime to DateTime64: add zero sub-second part. + */ +template +struct ConvertImpl + : DateTimeTransformImpl {}; + +template +struct ConvertImpl + : DateTimeTransformImpl {}; + +template +struct ConvertImpl + : DateTimeTransformImpl {}; + + +/** Transformation of numbers, dates, datetimes to strings: through formatting. 
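+  * Each FormatImpl specialization below writes one value into a WriteBuffer using the
+  * type's text representation, roughly (illustrative values):
+  *     Date       -> "2015-05-15"
+  *     DateTime   -> "2015-05-15 10:20:30"
+  *     DateTime64 -> "2015-05-15 10:20:30.123" (with getScale() fractional digits)
+  *     Enum       -> the name registered for the stored value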
+ */ +template +struct FormatImpl +{ + template + static ReturnType execute(const typename DataType::FieldType x, WriteBuffer & wb, const DataType *, const DateLUTImpl *) + { + writeText(x, wb); + return ReturnType(true); + } +}; + +template <> +struct FormatImpl +{ + template + static ReturnType execute(const DataTypeDate::FieldType x, WriteBuffer & wb, const DataTypeDate *, const DateLUTImpl * time_zone) + { + writeDateText(DayNum(x), wb, *time_zone); + return ReturnType(true); + } +}; + +template <> +struct FormatImpl +{ + template + static ReturnType execute(const DataTypeDate32::FieldType x, WriteBuffer & wb, const DataTypeDate32 *, const DateLUTImpl * time_zone) + { + writeDateText(ExtendedDayNum(x), wb, *time_zone); + return ReturnType(true); + } +}; + +template <> +struct FormatImpl +{ + template + static ReturnType execute(const DataTypeDateTime::FieldType x, WriteBuffer & wb, const DataTypeDateTime *, const DateLUTImpl * time_zone) + { + writeDateTimeText(x, wb, *time_zone); + return ReturnType(true); + } +}; + +template <> +struct FormatImpl +{ + template + static ReturnType execute(const DataTypeDateTime64::FieldType x, WriteBuffer & wb, const DataTypeDateTime64 * type, const DateLUTImpl * time_zone) + { + writeDateTimeText(DateTime64(x), type->getScale(), wb, *time_zone); + return ReturnType(true); + } +}; + + +template +struct FormatImpl> +{ + template + static ReturnType execute(const FieldType x, WriteBuffer & wb, const DataTypeEnum * type, const DateLUTImpl *) + { + static constexpr bool throw_exception = std::is_same_v; + + if constexpr (throw_exception) + { + writeString(type->getNameForValue(x), wb); + } + else + { + StringRef res; + bool is_ok = type->getNameForValue(x, res); + if (is_ok) + writeString(res, wb); + return ReturnType(is_ok); + } + } +}; + +template +struct FormatImpl> +{ + template + static ReturnType execute(const FieldType x, WriteBuffer & wb, const DataTypeDecimal * type, const DateLUTImpl *) + { + writeText(x, type->getScale(), wb, false); + return ReturnType(true); + } +}; + + +/// DataTypeEnum to DataType free conversion +template +struct ConvertImpl, DataTypeNumber, Name, ConvertDefaultBehaviorTag> +{ + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) + { + return arguments[0].column; + } +}; + +static inline ColumnUInt8::MutablePtr copyNullMap(ColumnPtr col) +{ + ColumnUInt8::MutablePtr null_map = nullptr; + if (const auto * col_null = checkAndGetColumn(col.get())) + { + null_map = ColumnUInt8::create(); + null_map->insertRangeFrom(col_null->getNullMapColumn(), 0, col_null->size()); + } + return null_map; +} + +template +requires (!std::is_same_v) +struct ConvertImpl +{ + using FromFieldType = typename FromDataType::FieldType; + using ColVecType = ColumnVectorOrDecimal; + + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) + { + if constexpr (IsDataTypeDateOrDateTime) + { + auto datetime_arg = arguments[0]; + + const DateLUTImpl * time_zone = nullptr; + const ColumnConst * time_zone_column = nullptr; + + if (arguments.size() == 1) + { + auto non_null_args = createBlockWithNestedColumns(arguments); + time_zone = &extractTimeZoneFromFunctionArguments(non_null_args, 1, 0); + } + else /// When we have a column for timezone + { + datetime_arg.column = datetime_arg.column->convertToFullColumnIfConst(); + + if constexpr (std::is_same_v || std::is_same_v) + time_zone = &DateLUT::instance(); + /// For argument of Date or 
DateTime type, second argument with time zone could be specified. + if constexpr (std::is_same_v || std::is_same_v) + { + if ((time_zone_column = checkAndGetColumnConst(arguments[1].column.get()))) + { + auto non_null_args = createBlockWithNestedColumns(arguments); + time_zone = &extractTimeZoneFromFunctionArguments(non_null_args, 1, 0); + } + } + } + const auto & col_with_type_and_name = columnGetNested(datetime_arg); + + if (const auto col_from = checkAndGetColumn(col_with_type_and_name.column.get())) + { + auto col_to = ColumnString::create(); + + const typename ColVecType::Container & vec_from = col_from->getData(); + ColumnString::Chars & data_to = col_to->getChars(); + ColumnString::Offsets & offsets_to = col_to->getOffsets(); + size_t size = vec_from.size(); + + if constexpr (std::is_same_v) + data_to.resize(size * (strlen("YYYY-MM-DD") + 1)); + else if constexpr (std::is_same_v) + data_to.resize(size * (strlen("YYYY-MM-DD") + 1)); + else if constexpr (std::is_same_v) + data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss") + 1)); + else if constexpr (std::is_same_v) + data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + col_from->getScale() + 1)); + else + data_to.resize(size * 3); /// Arbitrary + + offsets_to.resize(size); + + WriteBufferFromVector write_buffer(data_to); + const auto & type = static_cast(*col_with_type_and_name.type); + + ColumnUInt8::MutablePtr null_map = copyNullMap(datetime_arg.column); + + if (!null_map && arguments.size() > 1) + null_map = copyNullMap(arguments[1].column->convertToFullColumnIfConst()); + + if (null_map) + { + for (size_t i = 0; i < size; ++i) + { + if (!time_zone_column && arguments.size() > 1) + { + if (!arguments[1].column.get()->getDataAt(i).toString().empty()) + time_zone = &DateLUT::instance(arguments[1].column.get()->getDataAt(i).toString()); + else + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Provided time zone must be non-empty"); + } + bool is_ok = FormatImpl::template execute(vec_from[i], write_buffer, &type, time_zone); + null_map->getData()[i] |= !is_ok; + writeChar(0, write_buffer); + offsets_to[i] = write_buffer.count(); + } + } + else + { + for (size_t i = 0; i < size; ++i) + { + if (!time_zone_column && arguments.size() > 1) + { + if (!arguments[1].column.get()->getDataAt(i).toString().empty()) + time_zone = &DateLUT::instance(arguments[1].column.get()->getDataAt(i).toString()); + else + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Provided time zone must be non-empty"); + } + FormatImpl::template execute(vec_from[i], write_buffer, &type, time_zone); + writeChar(0, write_buffer); + offsets_to[i] = write_buffer.count(); + } + } + + write_buffer.finalize(); + + if (null_map) + return ColumnNullable::create(std::move(col_to), std::move(null_map)); + return col_to; + } + else + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + arguments[0].column->getName(), Name::name); + } + else + { + ColumnUInt8::MutablePtr null_map = copyNullMap(arguments[0].column); + + const auto & col_with_type_and_name = columnGetNested(arguments[0]); + const auto & type = static_cast(*col_with_type_and_name.type); + + if (const auto col_from = checkAndGetColumn(col_with_type_and_name.column.get())) + { + auto col_to = ColumnString::create(); + + const typename ColVecType::Container & vec_from = col_from->getData(); + ColumnString::Chars & data_to = col_to->getChars(); + ColumnString::Offsets & offsets_to = col_to->getOffsets(); + size_t size = vec_from.size(); + + 
data_to.resize(size * 3); + offsets_to.resize(size); + + WriteBufferFromVector write_buffer(data_to); + + if (null_map) + { + for (size_t i = 0; i < size; ++i) + { + bool is_ok = FormatImpl::template execute(vec_from[i], write_buffer, &type, nullptr); + /// We don't use timezones in this branch + null_map->getData()[i] |= !is_ok; + writeChar(0, write_buffer); + offsets_to[i] = write_buffer.count(); + } + } + else + { + for (size_t i = 0; i < size; ++i) + { + FormatImpl::template execute(vec_from[i], write_buffer, &type, nullptr); + writeChar(0, write_buffer); + offsets_to[i] = write_buffer.count(); + } + } + + write_buffer.finalize(); + + if (null_map) + return ColumnNullable::create(std::move(col_to), std::move(null_map)); + return col_to; + } + else + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + arguments[0].column->getName(), Name::name); + } + } +}; + + +/// Generic conversion of any type to String or FixedString via serialization to text. +template +struct ConvertImplGenericToString +{ + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) + { + static_assert(std::is_same_v || std::is_same_v, + "Can be used only to serialize to ColumnString or ColumnFixedString"); + + ColumnUInt8::MutablePtr null_map = copyNullMap(arguments[0].column); + + const auto & col_with_type_and_name = columnGetNested(arguments[0]); + const IDataType & type = *col_with_type_and_name.type; + const IColumn & col_from = *col_with_type_and_name.column; + + size_t size = col_from.size(); + auto col_to = removeNullable(result_type)->createColumn(); + + { + ColumnStringHelpers::WriteHelper write_helper( + assert_cast(*col_to), + size); + + auto & write_buffer = write_helper.getWriteBuffer(); + + FormatSettings format_settings; + auto serialization = type.getDefaultSerialization(); + for (size_t row = 0; row < size; ++row) + { + serialization->serializeText(col_from, row, write_buffer, format_settings); + write_helper.rowWritten(); + } + + write_helper.finalize(); + } + + if (result_type->isNullable() && null_map) + return ColumnNullable::create(std::move(col_to), std::move(null_map)); + return col_to; + } +}; + +/** Conversion of time_t to UInt16, Int32, UInt32 + */ +template +void convertFromTime(typename DataType::FieldType & x, time_t & time) +{ + x = time; +} + +template <> +inline void convertFromTime(DataTypeDate::FieldType & x, time_t & time) +{ + if (unlikely(time < 0)) + x = 0; + else if (unlikely(time > 0xFFFF)) + x = 0xFFFF; + else + x = time; +} + +template <> +inline void convertFromTime(DataTypeDate32::FieldType & x, time_t & time) +{ + x = static_cast(time); +} + +template <> +inline void convertFromTime(DataTypeDateTime::FieldType & x, time_t & time) +{ + if (unlikely(time < 0)) + x = 0; + else if (unlikely(time > MAX_DATETIME_TIMESTAMP)) + x = MAX_DATETIME_TIMESTAMP; + else + x = static_cast(time); +} + +/** Conversion of strings to numbers, dates, datetimes: through parsing. 
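+  * Each parseImpl overload below consumes one value from a ReadBuffer positioned on the text,
+  * e.g. (illustrative): readText parses "123" into an integer, readDateText parses "2015-05-15"
+  * into a DayNum, and readDateTimeText parses "2015-05-15 10:20:30" into a time_t that is then
+  * range-checked by convertFromTime above.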
+ */ +template +void parseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool precise_float_parsing) +{ + if constexpr (std::is_floating_point_v) + { + if (precise_float_parsing) + readFloatTextPrecise(x, rb); + else + readFloatTextFast(x, rb); + } + else + readText(x, rb); +} + +template <> +inline void parseImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) +{ + DayNum tmp(0); + readDateText(tmp, rb, *time_zone); + x = tmp; +} + +template <> +inline void parseImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) +{ + ExtendedDayNum tmp(0); + readDateText(tmp, rb, *time_zone); + x = tmp; +} + + +// NOTE: no need of extra overload of DateTime64, since readDateTimeText64 has different signature and that case is explicitly handled in the calling code. +template <> +inline void parseImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) +{ + time_t time = 0; + readDateTimeText(time, rb, *time_zone); + convertFromTime(x, time); +} + +template <> +inline void parseImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) +{ + UUID tmp; + readUUIDText(tmp, rb); + x = tmp.toUnderType(); +} + +template <> +inline void parseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) +{ + IPv4 tmp; + readIPv4Text(tmp, rb); + x = tmp.toUnderType(); +} + +template <> +inline void parseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) +{ + IPv6 tmp; + readIPv6Text(tmp, rb); + x = tmp; +} + +template +bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool precise_float_parsing) +{ + if constexpr (std::is_floating_point_v) + { + if (precise_float_parsing) + return tryReadFloatTextPrecise(x, rb); + else + return tryReadFloatTextFast(x, rb); + } + else /*if constexpr (is_integer_v)*/ + return tryReadIntText(x, rb); +} + +template <> +inline bool tryParseImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) +{ + DayNum tmp(0); + if (!tryReadDateText(tmp, rb, *time_zone)) + return false; + x = tmp; + return true; +} + +template <> +inline bool tryParseImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) +{ + ExtendedDayNum tmp(0); + if (!tryReadDateText(tmp, rb, *time_zone)) + return false; + x = tmp; + return true; +} + +template <> +inline bool tryParseImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) +{ + time_t time = 0; + if (!tryReadDateTimeText(time, rb, *time_zone)) + return false; + convertFromTime(x, time); + return true; +} + +template <> +inline bool tryParseImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) +{ + UUID tmp; + if (!tryReadUUIDText(tmp, rb)) + return false; + + x = tmp.toUnderType(); + return true; +} + +template <> +inline bool tryParseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) +{ + IPv4 tmp; + if (!tryReadIPv4Text(tmp, rb)) + return false; + + x = tmp.toUnderType(); + return true; +} + +template <> +inline bool tryParseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) +{ + IPv6 tmp; + if (!tryReadIPv6Text(tmp, rb)) + return false; + + x = tmp; + return true; +} + + +/** Throw exception with verbose message when string value is not parsed completely. 
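+  * For example (an illustrative message, abbreviated): toUInt8('123qqq') fails with
+  *     Cannot parse string '123qqq' as UInt8: syntax error at position 3 (parsed just '123').
+  *     Note: there are toUInt8OrZero and toUInt8OrNull functions, which return zero/NULL
+  *     instead of throwing an exception.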
+ */
+[[noreturn]] inline void throwExceptionForIncompletelyParsedValue(ReadBuffer & read_buffer, const IDataType & result_type)
+{
+    WriteBufferFromOwnString message_buf;
+    message_buf << "Cannot parse string " << quote << String(read_buffer.buffer().begin(), read_buffer.buffer().size())
+                << " as " << result_type.getName()
+                << ": syntax error";
+
+    if (read_buffer.offset())
+        message_buf << " at position " << read_buffer.offset()
+                    << " (parsed just " << quote << String(read_buffer.buffer().begin(), read_buffer.offset()) << ")";
+    else
+        message_buf << " at the beginning of the string";
+
+    // Currently there are no functions toIPv{4,6}Or{Null,Zero}
+    if (isNativeNumber(result_type) && !(result_type.getName() == "IPv4" || result_type.getName() == "IPv6"))
+        message_buf << ". Note: there are to" << result_type.getName() << "OrZero and to" << result_type.getName() << "OrNull functions, which return zero/NULL instead of throwing an exception.";
+
+    throw Exception(PreformattedMessage{message_buf.str(), "Cannot parse string {} as {}: syntax error {}"}, ErrorCodes::CANNOT_PARSE_TEXT);
+}
+
+
+enum class ConvertFromStringExceptionMode
+{
+    Throw,  /// Throw exception if value cannot be parsed.
+    Zero,   /// Fill with zero or default if value cannot be parsed.
+    Null    /// Return ColumnNullable with NULLs when value cannot be parsed.
+};
+
+enum class ConvertFromStringParsingMode
+{
+    Normal,
+    BestEffort,  /// Only applicable for DateTime. Will use a more sophisticated method that is slower.
+    BestEffortUS
+};
+
+template <typename FromDataType, typename ToDataType, typename Name,
+    ConvertFromStringExceptionMode exception_mode, ConvertFromStringParsingMode parsing_mode>
+struct ConvertThroughParsing
+{
+    static_assert(std::is_same_v<FromDataType, DataTypeString> || std::is_same_v<FromDataType, DataTypeFixedString>,
+        "ConvertThroughParsing is only applicable for String or FixedString data types");
+
+    static constexpr bool to_datetime64 = std::is_same_v<ToDataType, DataTypeDateTime64>;
+
+    static bool isAllRead(ReadBuffer & in)
+    {
+        /// In case of FixedString, skip zero bytes at the end.
+        if constexpr (std::is_same_v<FromDataType, DataTypeFixedString>)
+            while (!in.eof() && *in.position() == 0)
+                ++in.position();
+
+        if (in.eof())
+            return true;
+
+        /// Special case that allows parsing a string with DateTime or DateTime64 as Date or Date32.
+        if constexpr (std::is_same_v<ToDataType, DataTypeDate> || std::is_same_v<ToDataType, DataTypeDate32>)
+        {
+            if (!in.eof() && (*in.position() == ' ' || *in.position() == 'T'))
+            {
+                if (in.buffer().size() == strlen("YYYY-MM-DD hh:mm:ss"))
+                    return true;
+
+                if (in.buffer().size() >= strlen("YYYY-MM-DD hh:mm:ss.x")
+                    && in.buffer().begin()[19] == '.')
+                {
+                    in.position() = in.buffer().begin() + 20;
+
+                    while (!in.eof() && isNumericASCII(*in.position()))
+                        ++in.position();
+
+                    if (in.eof())
+                        return true;
+                }
+            }
+        }
+
+        return false;
+    }
+
+    template <typename Additions = void *>
+    static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & res_type, size_t input_rows_count,
+        Additions additions [[maybe_unused]] = Additions())
+    {
+        using ColVecTo = typename ToDataType::ColumnType;
+
+        const DateLUTImpl * local_time_zone [[maybe_unused]] = nullptr;
+        const DateLUTImpl * utc_time_zone [[maybe_unused]] = nullptr;
+
+        /// For conversion to Date or DateTime type, a second argument with time zone can be specified.
+        if constexpr (std::is_same_v<ToDataType, DataTypeDateTime> || to_datetime64)
+        {
+            const auto result_type = removeNullable(res_type);
+            // Time zone is already figured out during result type resolution, no need to do it here.
+ if (const auto dt_col = checkAndGetDataType(result_type.get())) + local_time_zone = &dt_col->getTimeZone(); + else + local_time_zone = &extractTimeZoneFromFunctionArguments(arguments, 1, 0); + + if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffort || parsing_mode == ConvertFromStringParsingMode::BestEffortUS) + utc_time_zone = &DateLUT::instance("UTC"); + } + else if constexpr (std::is_same_v || std::is_same_v) + { + // Timezone is more or less dummy when parsing Date/Date32 from string. + local_time_zone = &DateLUT::instance(); + utc_time_zone = &DateLUT::instance("UTC"); + } + + const IColumn * col_from = arguments[0].column.get(); + const ColumnString * col_from_string = checkAndGetColumn(col_from); + const ColumnFixedString * col_from_fixed_string = checkAndGetColumn(col_from); + + if (std::is_same_v && !col_from_string) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + col_from->getName(), Name::name); + + if (std::is_same_v && !col_from_fixed_string) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + col_from->getName(), Name::name); + + size_t size = input_rows_count; + typename ColVecTo::MutablePtr col_to = nullptr; + + if constexpr (IsDataTypeDecimal) + { + UInt32 scale = additions; + if constexpr (to_datetime64) + { + ToDataType check_bounds_in_ctor(scale, local_time_zone ? local_time_zone->getTimeZone() : String{}); + } + else + { + ToDataType check_bounds_in_ctor(ToDataType::maxPrecision(), scale); + } + col_to = ColVecTo::create(size, scale); + } + else + col_to = ColVecTo::create(size); + + typename ColVecTo::Container & vec_to = col_to->getData(); + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to [[maybe_unused]] = nullptr; + if constexpr (exception_mode == ConvertFromStringExceptionMode::Null) + { + col_null_map_to = ColumnUInt8::create(size); + vec_null_map_to = &col_null_map_to->getData(); + } + + const ColumnString::Chars * chars = nullptr; + const IColumn::Offsets * offsets = nullptr; + size_t fixed_string_size = 0; + + if constexpr (std::is_same_v) + { + chars = &col_from_string->getChars(); + offsets = &col_from_string->getOffsets(); + } + else + { + chars = &col_from_fixed_string->getChars(); + fixed_string_size = col_from_fixed_string->getN(); + } + + size_t current_offset = 0; + + bool precise_float_parsing = false; + + if (DB::CurrentThread::isInitialized()) + { + const DB::ContextPtr query_context = DB::CurrentThread::get().getQueryContext(); + + if (query_context) + precise_float_parsing = query_context->getSettingsRef().precise_float_parsing; + } + + for (size_t i = 0; i < size; ++i) + { + size_t next_offset = std::is_same_v ? (*offsets)[i] : (current_offset + fixed_string_size); + size_t string_size = std::is_same_v ? 
next_offset - current_offset - 1 : fixed_string_size; + + ReadBufferFromMemory read_buffer(&(*chars)[current_offset], string_size); + + if constexpr (exception_mode == ConvertFromStringExceptionMode::Throw) + { + if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffort) + { + if constexpr (to_datetime64) + { + DateTime64 res = 0; + parseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); + vec_to[i] = res; + } + else + { + time_t res; + parseDateTimeBestEffort(res, read_buffer, *local_time_zone, *utc_time_zone); + convertFromTime(vec_to[i], res); + } + } + else if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffortUS) + { + if constexpr (to_datetime64) + { + DateTime64 res = 0; + parseDateTime64BestEffortUS(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); + vec_to[i] = res; + } + else + { + time_t res; + parseDateTimeBestEffortUS(res, read_buffer, *local_time_zone, *utc_time_zone); + convertFromTime(vec_to[i], res); + } + } + else + { + if constexpr (to_datetime64) + { + DateTime64 value = 0; + readDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone); + vec_to[i] = value; + } + else if constexpr (IsDataTypeDecimal) + { + SerializationDecimal::readText( + vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale()); + } + else + { + /// we want to utilize constexpr condition here, which is not mixable with value comparison + do + { + if constexpr (std::is_same_v && std::is_same_v) + { + if (fixed_string_size == IPV6_BINARY_LENGTH) + { + readBinary(vec_to[i], read_buffer); + break; + } + } + parseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing); + } while (false); + } + } + + if (!isAllRead(read_buffer)) + throwExceptionForIncompletelyParsedValue(read_buffer, *res_type); + } + else + { + bool parsed; + + if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffort) + { + if constexpr (to_datetime64) + { + DateTime64 res = 0; + parsed = tryParseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); + vec_to[i] = res; + } + else + { + time_t res; + parsed = tryParseDateTimeBestEffort(res, read_buffer, *local_time_zone, *utc_time_zone); + convertFromTime(vec_to[i],res); + } + } + else if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffortUS) + { + if constexpr (to_datetime64) + { + DateTime64 res = 0; + parsed = tryParseDateTime64BestEffortUS(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); + vec_to[i] = res; + } + else + { + time_t res; + parsed = tryParseDateTimeBestEffortUS(res, read_buffer, *local_time_zone, *utc_time_zone); + convertFromTime(vec_to[i],res); + } + } + else + { + if constexpr (to_datetime64) + { + DateTime64 value = 0; + parsed = tryReadDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone); + vec_to[i] = value; + } + else if constexpr (IsDataTypeDecimal) + { + parsed = SerializationDecimal::tryReadText( + vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale()); + } + else + { + /// we want to utilize constexpr condition here, which is not mixable with value comparison + do + { + if constexpr (std::is_same_v && std::is_same_v) + { + if (fixed_string_size == IPV6_BINARY_LENGTH) + { + readBinary(vec_to[i], read_buffer); + parsed = true; + break; + } + } + + parsed = tryParseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing); + } while (false); + } + } + + if (!isAllRead(read_buffer)) + parsed = 
false; + + if (!parsed) + { + if constexpr (std::is_same_v) + { + vec_to[i] = -static_cast(DateLUT::instance().getDayNumOffsetEpoch()); + } + else + { + vec_to[i] = static_cast(0); + } + } + + if constexpr (exception_mode == ConvertFromStringExceptionMode::Null) + (*vec_null_map_to)[i] = !parsed; + } + + current_offset = next_offset; + } + + if constexpr (exception_mode == ConvertFromStringExceptionMode::Null) + return ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); + else + return col_to; + } +}; + + +template +requires (!std::is_same_v) +struct ConvertImpl + : ConvertThroughParsing {}; + +template +requires (!std::is_same_v) +struct ConvertImpl + : ConvertThroughParsing {}; + +template +requires (!std::is_same_v) +struct ConvertImpl + : ConvertThroughParsing {}; + +template +requires (!std::is_same_v) +struct ConvertImpl + : ConvertThroughParsing {}; + +template +requires (is_any_of && is_any_of) +struct ConvertImpl + : ConvertThroughParsing {}; + +/// Generic conversion of any type from String. Used for complex types: Array and Tuple or types with custom serialization. +struct ConvertImplGenericFromString +{ + static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) + { + const IColumn & column_from = *arguments[0].column; + const IDataType & data_type_to = *result_type; + auto res = data_type_to.createColumn(); + auto serialization = data_type_to.getDefaultSerialization(); + const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr; + + executeImpl(column_from, *res, *serialization, input_rows_count, null_map, result_type.get()); + return res; + } + + static void executeImpl( + const IColumn & column_from, + IColumn & column_to, + const ISerialization & serialization_from, + size_t input_rows_count, + const PaddedPODArray * null_map, + const IDataType * result_type) + { + column_to.reserve(input_rows_count); + + FormatSettings format_settings; + for (size_t i = 0; i < input_rows_count; ++i) + { + if (null_map && (*null_map)[i]) + { + column_to.insertDefault(); + continue; + } + + const auto & val = column_from.getDataAt(i); + ReadBufferFromMemory read_buffer(val.data, val.size); + try + { + serialization_from.deserializeWholeText(column_to, read_buffer, format_settings); + } + catch (const Exception & e) + { + auto * nullable_column = typeid_cast(&column_to); + if (e.code() == ErrorCodes::CANNOT_PARSE_BOOL && nullable_column) + { + auto & col_nullmap = nullable_column->getNullMapData(); + if (col_nullmap.size() != nullable_column->size()) + col_nullmap.resize_fill(nullable_column->size()); + if (nullable_column->size() == (i + 1)) + nullable_column->popBack(1); + nullable_column->insertDefault(); + continue; + } + throw; + } + + if (!read_buffer.eof()) + { + if (result_type) + throwExceptionForIncompletelyParsedValue(read_buffer, *result_type); + else + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, + "Cannot parse string to column {}. Expected eof", column_to.getName()); + } + } + } +}; + + +template <> +struct ConvertImpl + : ConvertImpl {}; + +template <> +struct ConvertImpl + : ConvertImpl {}; + +/** If types are identical, just take reference to column. 
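+  * E.g. converting a UInt8 column to UInt8 returns the very same column without copying:
+  * the specializations below simply forward arguments[0].column.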
+ */ +template +requires (!T::is_parametric) +struct ConvertImpl +{ + template + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/, + Additions additions [[maybe_unused]] = Additions()) + { + return arguments[0].column; + } +}; + +template +struct ConvertImpl +{ + template + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/, + Additions additions [[maybe_unused]] = Additions()) + { + + return arguments[0].column; + } +}; + + +/** Conversion from FixedString to String. + * Cutting sequences of zero bytes from end of strings. + */ +template +struct ConvertImpl +{ + static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type, size_t /*input_rows_count*/) + { + ColumnUInt8::MutablePtr null_map = copyNullMap(arguments[0].column); + const auto & nested = columnGetNested(arguments[0]); + if (const ColumnFixedString * col_from = checkAndGetColumn(nested.column.get())) + { + auto col_to = ColumnString::create(); + + const ColumnFixedString::Chars & data_from = col_from->getChars(); + ColumnString::Chars & data_to = col_to->getChars(); + ColumnString::Offsets & offsets_to = col_to->getOffsets(); + size_t size = col_from->size(); + size_t n = col_from->getN(); + data_to.resize(size * (n + 1)); /// + 1 - zero terminator + offsets_to.resize(size); + + size_t offset_from = 0; + size_t offset_to = 0; + for (size_t i = 0; i < size; ++i) + { + if (!null_map || !null_map->getData()[i]) + { + size_t bytes_to_copy = n; + while (bytes_to_copy > 0 && data_from[offset_from + bytes_to_copy - 1] == 0) + --bytes_to_copy; + + memcpy(&data_to[offset_to], &data_from[offset_from], bytes_to_copy); + offset_to += bytes_to_copy; + } + data_to[offset_to] = 0; + ++offset_to; + offsets_to[i] = offset_to; + offset_from += n; + } + + data_to.resize(offset_to); + if (return_type->isNullable() && null_map) + return ColumnNullable::create(std::move(col_to), std::move(null_map)); + return col_to; + } + else + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + arguments[0].column->getName(), Name::name); + } +}; + + +/// Declared early because used below. 
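+/// Each Name struct only carries the SQL-visible function name as a compile-time constant;
+/// it is plugged into the conversion function templates as the Name parameter,
+/// roughly like this (a sketch, the actual aliases are declared further below):
+///
+///     using FunctionToDate = FunctionConvert<DataTypeDate, NameToDate, ToDateMonotonicity>;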
+struct NameToDate { static constexpr auto name = "toDate"; }; +struct NameToDate32 { static constexpr auto name = "toDate32"; }; +struct NameToDateTime { static constexpr auto name = "toDateTime"; }; +struct NameToDateTime32 { static constexpr auto name = "toDateTime32"; }; +struct NameToDateTime64 { static constexpr auto name = "toDateTime64"; }; +struct NameToString { static constexpr auto name = "toString"; }; +struct NameToDecimal32 { static constexpr auto name = "toDecimal32"; }; +struct NameToDecimal64 { static constexpr auto name = "toDecimal64"; }; +struct NameToDecimal128 { static constexpr auto name = "toDecimal128"; }; +struct NameToDecimal256 { static constexpr auto name = "toDecimal256"; }; + + +#define DEFINE_NAME_TO_INTERVAL(INTERVAL_KIND) \ + struct NameToInterval ## INTERVAL_KIND \ + { \ + static constexpr auto name = "toInterval" #INTERVAL_KIND; \ + static constexpr auto kind = IntervalKind::Kind::INTERVAL_KIND; \ + }; + +DEFINE_NAME_TO_INTERVAL(Nanosecond) +DEFINE_NAME_TO_INTERVAL(Microsecond) +DEFINE_NAME_TO_INTERVAL(Millisecond) +DEFINE_NAME_TO_INTERVAL(Second) +DEFINE_NAME_TO_INTERVAL(Minute) +DEFINE_NAME_TO_INTERVAL(Hour) +DEFINE_NAME_TO_INTERVAL(Day) +DEFINE_NAME_TO_INTERVAL(Week) +DEFINE_NAME_TO_INTERVAL(Month) +DEFINE_NAME_TO_INTERVAL(Quarter) +DEFINE_NAME_TO_INTERVAL(Year) + +#undef DEFINE_NAME_TO_INTERVAL + +struct NameParseDateTimeBestEffort; +struct NameParseDateTimeBestEffortOrZero; +struct NameParseDateTimeBestEffortOrNull; + +template +static inline bool isDateTime64(const ColumnsWithTypeAndName & arguments) +{ + if constexpr (std::is_same_v) + return true; + else if constexpr (std::is_same_v || std::is_same_v + || std::is_same_v || std::is_same_v) + { + return (arguments.size() == 2 && isUInt(arguments[1].type)) || arguments.size() == 3; + } + + return false; +} + +template +class FunctionConvert : public IFunction +{ +public: + using Monotonic = MonotonicityImpl; + + static constexpr auto name = Name::name; + static constexpr bool to_decimal = + std::is_same_v || std::is_same_v + || std::is_same_v || std::is_same_v; + + static constexpr bool to_datetime64 = std::is_same_v; + + static constexpr bool to_string_or_fixed_string = std::is_same_v || + std::is_same_v; + + static constexpr bool to_date_or_datetime = std::is_same_v || + std::is_same_v || + std::is_same_v; + + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + static FunctionPtr create() { return std::make_shared(); } + + FunctionConvert() = default; + explicit FunctionConvert(ContextPtr context_) : context(context_) {} + + String getName() const override + { + return name; + } + + bool isVariadic() const override { return true; } + size_t getNumberOfArguments() const override { return 0; } + bool isInjective(const ColumnsWithTypeAndName &) const override { return std::is_same_v; } + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & arguments) const override + { + /// TODO: We can make more optimizations here. 
+ return !(to_date_or_datetime && isNumber(*arguments[0].type)); + } + + using DefaultReturnTypeGetter = std::function; + static DataTypePtr getReturnTypeDefaultImplementationForNulls(const ColumnsWithTypeAndName & arguments, const DefaultReturnTypeGetter & getter) + { + NullPresence null_presence = getNullPresense(arguments); + + if (null_presence.has_null_constant) + { + return makeNullable(std::make_shared()); + } + if (null_presence.has_nullable) + { + auto nested_columns = Block(createBlockWithNestedColumns(arguments)); + auto return_type = getter(ColumnsWithTypeAndName(nested_columns.begin(), nested_columns.end())); + return makeNullable(return_type); + } + + return getter(arguments); + } + + DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override + { + auto getter = [&] (const auto & args) { return getReturnTypeImplRemovedNullable(args); }; + auto res = getReturnTypeDefaultImplementationForNulls(arguments, getter); + to_nullable = res->isNullable(); + checked_return_type = true; + return res; + } + + DataTypePtr getReturnTypeImplRemovedNullable(const ColumnsWithTypeAndName & arguments) const + { + FunctionArgumentDescriptors mandatory_args = {{"Value", nullptr, nullptr, nullptr}}; + FunctionArgumentDescriptors optional_args; + + if constexpr (to_decimal) + { + mandatory_args.push_back({"scale", static_cast(&isNativeInteger), &isColumnConst, "const Integer"}); + } + + if (!to_decimal && isDateTime64(arguments)) + { + mandatory_args.push_back({"scale", static_cast(&isNativeInteger), &isColumnConst, "const Integer"}); + } + + // toString(DateTime or DateTime64, [timezone: String]) + if ((std::is_same_v && !arguments.empty() && (isDateTime64(arguments[0].type) || isDateTime(arguments[0].type))) + // toUnixTimestamp(value[, timezone : String]) + || std::is_same_v + // toDate(value[, timezone : String]) + || std::is_same_v // TODO: shall we allow timestamp argument for toDate? DateTime knows nothing about timezones and this argument is ignored below. + // toDate32(value[, timezone : String]) + || std::is_same_v + // toDateTime(value[, timezone: String]) + || std::is_same_v + // toDateTime64(value, scale : Integer[, timezone: String]) + || std::is_same_v) + { + optional_args.push_back({"timezone", static_cast(&isString), nullptr, "String"}); + } + + validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); + + if constexpr (std::is_same_v) + { + return std::make_shared(Name::kind); + } + else if constexpr (to_decimal) + { + UInt64 scale = extractToDecimalScale(arguments[1]); + + if constexpr (std::is_same_v) + return createDecimalMaxPrecision(scale); + else if constexpr (std::is_same_v) + return createDecimalMaxPrecision(scale); + else if constexpr (std::is_same_v) + return createDecimalMaxPrecision(scale); + else if constexpr (std::is_same_v) + return createDecimalMaxPrecision(scale); + + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected branch in code of conversion function: it is a bug."); + } + else + { + // Optional second argument with time zone for DateTime. + UInt8 timezone_arg_position = 1; + UInt32 scale [[maybe_unused]] = DataTypeDateTime64::default_scale; + + // DateTime64 requires more arguments: scale and timezone. Since timezone is optional, scale should be first. 
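+            // E.g. (illustrative): toDateTime64('2023-10-27 19:17:13.123', 3) and
+            // toDateTime64('2023-10-27 19:17:13.123', 3, 'UTC') are both accepted;
+            // the scale argument cannot be omitted when a timezone is passed.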
+ if (isDateTime64(arguments)) + { + timezone_arg_position += 1; + scale = static_cast(arguments[1].column->get64(0)); + + if (to_datetime64 || scale != 0) /// toDateTime('xxxx-xx-xx xx:xx:xx', 0) return DateTime + return std::make_shared(scale, + extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0, false)); + + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0, false)); + } + + if constexpr (std::is_same_v) + return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0, false)); + else if constexpr (std::is_same_v) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected branch in code of conversion function: it is a bug."); + else + return std::make_shared(); + } + } + + /// Function actually uses default implementation for nulls, + /// but we need to know if return type is Nullable or not, + /// so we use checked_return_type only to intercept the first call to getReturnTypeImpl(...). + bool useDefaultImplementationForNulls() const override + { + bool to_nullable_string = to_nullable && std::is_same_v; + return checked_return_type && !to_nullable_string; + } + + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override + { + if constexpr (std::is_same_v) + return {}; + else if constexpr (std::is_same_v) + return {2}; + return {1}; + } + bool canBeExecutedOnDefaultArguments() const override { return false; } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + try + { + return executeInternal(arguments, result_type, input_rows_count); + } + catch (Exception & e) + { + /// More convenient error message. 
+ if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) + { + e.addMessage("Cannot parse " + + result_type->getName() + " from " + + arguments[0].type->getName() + + ", because value is too short"); + } + else if (e.code() == ErrorCodes::CANNOT_PARSE_NUMBER + || e.code() == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT + || e.code() == ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED + || e.code() == ErrorCodes::CANNOT_PARSE_QUOTED_STRING + || e.code() == ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE + || e.code() == ErrorCodes::CANNOT_PARSE_DATE + || e.code() == ErrorCodes::CANNOT_PARSE_DATETIME + || e.code() == ErrorCodes::CANNOT_PARSE_UUID + || e.code() == ErrorCodes::CANNOT_PARSE_IPV4 + || e.code() == ErrorCodes::CANNOT_PARSE_IPV6) + { + e.addMessage("Cannot parse " + + result_type->getName() + " from " + + arguments[0].type->getName()); + } + + throw; + } + } + + bool hasInformationAboutMonotonicity() const override + { + return Monotonic::has(); + } + + Monotonicity getMonotonicityForRange(const IDataType & type, const Field & left, const Field & right) const override + { + return Monotonic::get(type, left, right); + } + +private: + ContextPtr context; + mutable bool checked_return_type = false; + mutable bool to_nullable = false; + + ColumnPtr executeInternal(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const + { + if (arguments.empty()) + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects at least 1 argument", getName()); + + if (result_type->onlyNull()) + return result_type->createColumnConstWithDefaultValue(input_rows_count); + + const DataTypePtr from_type = removeNullable(arguments[0].type); + ColumnPtr result_column; + + [[maybe_unused]] FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior = default_date_time_overflow_behavior; + + if (context) + date_time_overflow_behavior = context->getSettingsRef().date_time_overflow_behavior.value; + + auto call = [&](const auto & types, const auto & tag) -> bool + { + using Types = std::decay_t; + using LeftDataType = typename Types::LeftType; + using RightDataType = typename Types::RightType; + using SpecialTag = std::decay_t; + + if constexpr (IsDataTypeDecimal) + { + if constexpr (std::is_same_v) + { + /// Account for optional timezone argument. 
+ if (arguments.size() != 2 && arguments.size() != 3) + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects 2 or 3 arguments for DataTypeDateTime64.", getName()); + } + else if (arguments.size() != 2) + { + throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects 2 arguments for Decimal.", getName()); + } + + const ColumnWithTypeAndName & scale_column = arguments[1]; + UInt32 scale = extractToDecimalScale(scale_column); + + switch (date_time_overflow_behavior) + { + case FormatSettings::DateTimeOverflowBehavior::Throw: + result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); + break; + case FormatSettings::DateTimeOverflowBehavior::Ignore: + result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); + break; + case FormatSettings::DateTimeOverflowBehavior::Saturate: + result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); + break; + } + + } + else if constexpr (IsDataTypeDateOrDateTime && std::is_same_v) + { + const auto * dt64 = assert_cast(arguments[0].type.get()); + switch (date_time_overflow_behavior) + { + case FormatSettings::DateTimeOverflowBehavior::Throw: + result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, dt64->getScale()); + break; + case FormatSettings::DateTimeOverflowBehavior::Ignore: + result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, dt64->getScale()); + break; + case FormatSettings::DateTimeOverflowBehavior::Saturate: + result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, dt64->getScale()); + break; + } + } +#define GENERATE_OVERFLOW_MODE_CASE(OVERFLOW_MODE) \ + case FormatSettings::DateTimeOverflowBehavior::OVERFLOW_MODE: \ + result_column = ConvertImpl::execute( \ + arguments, result_type, input_rows_count); \ + break; + + else if constexpr (IsDataTypeDecimalOrNumber && IsDataTypeDecimalOrNumber) + { + using LeftT = typename LeftDataType::FieldType; + using RightT = typename RightDataType::FieldType; + + static constexpr bool bad_left = + is_decimal || std::is_floating_point_v || is_big_int_v || is_signed_v; + static constexpr bool bad_right = + is_decimal || std::is_floating_point_v || is_big_int_v || is_signed_v; + + /// Disallow int vs UUID conversion (but support int vs UInt128 conversion) + if constexpr ((bad_left && std::is_same_v) || + (bad_right && std::is_same_v)) + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Wrong UUID conversion"); + } + else + { + switch (date_time_overflow_behavior) + { + GENERATE_OVERFLOW_MODE_CASE(Throw) + GENERATE_OVERFLOW_MODE_CASE(Ignore) + GENERATE_OVERFLOW_MODE_CASE(Saturate) + } + } + } + else if constexpr ((IsDataTypeNumber || IsDataTypeDateOrDateTime) + && IsDataTypeDateOrDateTime) + { + switch (date_time_overflow_behavior) + { + GENERATE_OVERFLOW_MODE_CASE(Throw) + GENERATE_OVERFLOW_MODE_CASE(Ignore) + GENERATE_OVERFLOW_MODE_CASE(Saturate) + } + } +#undef GENERATE_OVERFLOW_MODE_CASE + else + result_column = ConvertImpl::execute(arguments, result_type, input_rows_count); + + return true; + }; + + if (isDateTime64(arguments)) + { + /// For toDateTime('xxxx-xx-xx xx:xx:xx.00', 2[, 'timezone']) we need to it convert to DateTime64 + const ColumnWithTypeAndName & scale_column = arguments[1]; + UInt32 scale = extractToDecimalScale(scale_column); + + if (to_datetime64 || scale != 0) /// When scale = 0, the data type is DateTime otherwise the data type is DateTime64 + { + if 
(!callOnIndexAndDataType(from_type->getTypeId(), call, ConvertDefaultBehaviorTag{})) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", + arguments[0].type->getName(), getName()); + + return result_column; + } + } + + if constexpr (std::is_same_v) + { + if (from_type->getCustomSerialization()) + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + } + + bool done = false; + if constexpr (to_string_or_fixed_string) + { + done = callOnIndexAndDataType(from_type->getTypeId(), call, ConvertDefaultBehaviorTag{}); + } + else + { + bool cast_ipv4_ipv6_default_on_conversion_error = false; + if constexpr (is_any_of) + if (context && (cast_ipv4_ipv6_default_on_conversion_error = context->getSettingsRef().cast_ipv4_ipv6_default_on_conversion_error)) + done = callOnIndexAndDataType(from_type->getTypeId(), call, ConvertReturnZeroOnErrorTag{}); + + if (!cast_ipv4_ipv6_default_on_conversion_error) + { + /// We should use ConvertFromStringExceptionMode::Null mode when converting from String (or FixedString) + /// to Nullable type, to avoid 'value is too short' error on attempt to parse empty string from NULL values. + if (to_nullable && WhichDataType(from_type).isStringOrFixedString()) + done = callOnIndexAndDataType(from_type->getTypeId(), call, ConvertReturnNullOnErrorTag{}); + else + done = callOnIndexAndDataType(from_type->getTypeId(), call, ConvertDefaultBehaviorTag{}); + } + } + + if (!done) + { + /// Generic conversion of any type to String. + if (std::is_same_v) + { + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + } + else + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", + arguments[0].type->getName(), getName()); + } + + return result_column; + } +}; + + +/** Function toTOrZero (where T is number of date or datetime type): + * try to convert from String to type T through parsing, + * if cannot parse, return default value instead of throwing exception. + * Function toTOrNull will return Nullable type with NULL when cannot parse. + * NOTE Also need to implement tryToUnixTimestamp with timezone. 
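+  * An illustrative contrast of the three behaviors:
+  *     toUInt32('abc')       -- throws an exception (cannot be parsed)
+  *     toUInt32OrZero('abc') -- returns 0
+  *     toUInt32OrNull('abc') -- returns NULL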
+ */
+template <typename ToDataType, typename Name, ConvertFromStringExceptionMode exception_mode, ConvertFromStringParsingMode parsing_mode = ConvertFromStringParsingMode::Normal>
+class FunctionConvertFromString : public IFunction
+{
+public:
+    static constexpr auto name = Name::name;
+    static constexpr bool to_decimal =
+        std::is_same_v<ToDataType, DataTypeDecimal<Decimal32>> ||
+        std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>> ||
+        std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>> ||
+        std::is_same_v<ToDataType, DataTypeDecimal<Decimal256>>;
+
+    static constexpr bool to_datetime64 = std::is_same_v<ToDataType, DataTypeDateTime64>;
+
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionConvertFromString>(); }
+    static FunctionPtr create() { return std::make_shared<FunctionConvertFromString>(); }
+
+    String getName() const override
+    {
+        return name;
+    }
+
+    bool isVariadic() const override { return true; }
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
+    size_t getNumberOfArguments() const override { return 0; }
+
+    bool useDefaultImplementationForConstants() const override { return true; }
+    bool canBeExecutedOnDefaultArguments() const override { return false; }
+
+    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
+
+    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
+    {
+        DataTypePtr res;
+
+        if (isDateTime64<Name, ToDataType>(arguments))
+        {
+            validateFunctionArgumentTypes(*this, arguments,
+                FunctionArgumentDescriptors{{"string", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isStringOrFixedString), nullptr, "String or FixedString"}},
+                // optional
+                FunctionArgumentDescriptors{
+                    {"precision", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isUInt8), isColumnConst, "const UInt8"},
+                    {"timezone", static_cast<FunctionArgumentDescriptor::TypeValidator>(&isStringOrFixedString), isColumnConst, "const String or FixedString"},
+                });
+
+            UInt64 scale = to_datetime64 ? DataTypeDateTime64::default_scale : 0;
+            if (arguments.size() > 1)
+                scale = extractToDecimalScale(arguments[1]);
+            const auto timezone = extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false);
+
+            if (scale == 0)
+                res = std::make_shared<DataTypeDateTime>(timezone);
+            else
+                res = std::make_shared<DataTypeDateTime64>(scale, timezone);
+        }
+        else
+        {
+            if ((arguments.size() != 1 && arguments.size() != 2) || (to_decimal && arguments.size() != 2))
+                throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
+                    "Number of arguments for function {} doesn't match: passed {}, should be 1 or 2. "
+                    "Second argument only makes sense for DateTime (time zone, optional) and Decimal (scale).",
+                    getName(), arguments.size());
+
+            if (!isStringOrFixedString(arguments[0].type))
+            {
+                if (this->getName().find("OrZero") != std::string::npos ||
+                    this->getName().find("OrNull") != std::string::npos)
+                    throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. 
" + "Conversion functions with postfix 'OrZero' or 'OrNull' should take String argument", + arguments[0].type->getName(), getName()); + else + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}", + arguments[0].type->getName(), getName()); + } + + if (arguments.size() == 2) + { + if constexpr (std::is_same_v) + { + if (!isString(arguments[1].type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of 2nd argument of function {}", + arguments[1].type->getName(), getName()); + } + else if constexpr (to_decimal) + { + if (!isInteger(arguments[1].type)) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of 2nd argument of function {}", + arguments[1].type->getName(), getName()); + if (!arguments[1].column) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be constant", getName()); + } + else + { + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Number of arguments for function {} doesn't match: passed {}, should be 1. " + "Second argument makes sense only for DateTime and Decimal.", + getName(), arguments.size()); + } + } + + if constexpr (std::is_same_v) + res = std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, false)); + else if constexpr (std::is_same_v) + throw Exception(ErrorCodes::LOGICAL_ERROR, "MaterializedMySQL is a bug."); + else if constexpr (to_decimal) + { + UInt64 scale = extractToDecimalScale(arguments[1]); + res = createDecimalMaxPrecision(scale); + if (!res) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Something wrong with toDecimalNNOrZero() or toDecimalNNOrNull()"); + } + else + res = std::make_shared(); + } + + if constexpr (exception_mode == ConvertFromStringExceptionMode::Null) + res = std::make_shared(res); + + return res; + } + + template + ColumnPtr executeInternal(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count, UInt32 scale = 0) const + { + const IDataType * from_type = arguments[0].type.get(); + + if (checkAndGetDataType(from_type)) + { + return ConvertThroughParsing::execute( + arguments, result_type, input_rows_count, scale); + } + else if (checkAndGetDataType(from_type)) + { + return ConvertThroughParsing::execute( + arguments, result_type, input_rows_count, scale); + } + + return nullptr; + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + ColumnPtr result_column; + + if constexpr (to_decimal) + result_column = executeInternal(arguments, result_type, input_rows_count, + assert_cast(*removeNullable(result_type)).getScale()); + else + { + if (isDateTime64(arguments)) + { + UInt64 scale = to_datetime64 ? DataTypeDateTime64::default_scale : 0; + if (arguments.size() > 1) + scale = extractToDecimalScale(arguments[1]); + + if (scale == 0) + result_column = executeInternal(arguments, result_type, input_rows_count); + else + { + result_column = executeInternal(arguments, result_type, input_rows_count, static_cast(scale)); + } + } + else + { + result_column = executeInternal(arguments, result_type, input_rows_count); + } + } + + if (!result_column) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}. " + "Only String or FixedString argument is accepted for try-conversion function. 
For other arguments, " + "use function without 'orZero' or 'orNull'.", arguments[0].type->getName(), getName()); + + return result_column; + } +}; + + +/// Monotonicity. + +struct PositiveMonotonicity +{ + static bool has() { return true; } + static IFunction::Monotonicity get(const IDataType &, const Field &, const Field &) + { + return { .is_monotonic = true }; + } +}; + +struct UnknownMonotonicity +{ + static bool has() { return false; } + static IFunction::Monotonicity get(const IDataType &, const Field &, const Field &) + { + return { }; + } +}; + +template +struct ToNumberMonotonicity +{ + static bool has() { return true; } + + static UInt64 divideByRangeOfType(UInt64 x) + { + if constexpr (sizeof(T) < sizeof(UInt64)) + return x >> (sizeof(T) * 8); + else + return 0; + } + + static IFunction::Monotonicity get(const IDataType & type, const Field & left, const Field & right) + { + if (!type.isValueRepresentedByNumber()) + return {}; + + /// If type is same, the conversion is always monotonic. + /// (Enum has separate case, because it is different data type) + if (checkAndGetDataType>(&type) || + checkAndGetDataType>(&type)) + return { .is_monotonic = true, .is_always_monotonic = true }; + + /// Float cases. + + /// When converting to Float, the conversion is always monotonic. + if constexpr (std::is_floating_point_v) + return { .is_monotonic = true, .is_always_monotonic = true }; + + const auto * low_cardinality = typeid_cast(&type); + const IDataType * low_cardinality_dictionary_type = nullptr; + if (low_cardinality) + low_cardinality_dictionary_type = low_cardinality->getDictionaryType().get(); + + WhichDataType which_type(type); + WhichDataType which_inner_type = low_cardinality + ? WhichDataType(low_cardinality_dictionary_type) + : WhichDataType(type); + + /// If converting from Float, for monotonicity, arguments must fit in range of result type. + if (which_inner_type.isFloat()) + { + if (left.isNull() || right.isNull()) + return {}; + + Float64 left_float = left.get(); + Float64 right_float = right.get(); + + if (left_float >= static_cast(std::numeric_limits::min()) + && left_float <= static_cast(std::numeric_limits::max()) + && right_float >= static_cast(std::numeric_limits::min()) + && right_float <= static_cast(std::numeric_limits::max())) + return { .is_monotonic = true }; + + return {}; + } + + /// Integer cases. + + /// Only support types represented by native integers. + /// It can be extended to big integers, decimals and DateTime64 later. + /// By the way, NULLs are representing unbounded ranges. + if (!((left.isNull() || left.getType() == Field::Types::UInt64 || left.getType() == Field::Types::Int64) + && (right.isNull() || right.getType() == Field::Types::UInt64 || right.getType() == Field::Types::Int64))) + return {}; + + const bool from_is_unsigned = type.isValueRepresentedByUnsignedInteger(); + const bool to_is_unsigned = is_unsigned_v; + + const size_t size_of_from = type.getSizeOfValueInMemory(); + const size_t size_of_to = sizeof(T); + + const bool left_in_first_half = left.isNull() + ? from_is_unsigned + : (left.get() >= 0); + + const bool right_in_first_half = right.isNull() + ? !from_is_unsigned + : (right.get() >= 0); + + /// Size of type is the same. + if (size_of_from == size_of_to) + { + if (from_is_unsigned == to_is_unsigned) + return { .is_monotonic = true, .is_always_monotonic = true }; + + if (left_in_first_half == right_in_first_half) + return { .is_monotonic = true }; + + return {}; + } + + /// Size of type is expanded. 
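+        /// (Illustrative note, not from the original source: Int8 -> Int64 preserves the
+        /// order of every value, so it is always monotonic, while Int8 -> UInt64 wraps
+        /// negative values to huge positive ones, so it is monotonic only on ranges that
+        /// stay within one sign; that is exactly what the branches below check.)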
+ if (size_of_from < size_of_to) + { + if (from_is_unsigned == to_is_unsigned) + return { .is_monotonic = true, .is_always_monotonic = true }; + + if (!to_is_unsigned) + return { .is_monotonic = true, .is_always_monotonic = true }; + + /// signed -> unsigned. If arguments from the same half, then function is monotonic. + if (left_in_first_half == right_in_first_half) + return { .is_monotonic = true }; + + return {}; + } + + /// Size of type is shrunk. + if (size_of_from > size_of_to) + { + /// Function cannot be monotonic on unbounded ranges. + if (left.isNull() || right.isNull()) + return {}; + + /// Function cannot be monotonic when left and right are not on the same ranges. + if (divideByRangeOfType(left.get()) != divideByRangeOfType(right.get())) + return {}; + + if (to_is_unsigned) + return { .is_monotonic = true }; + else + { + // If To is signed, it's possible that the signedness is different after conversion. So we check it explicitly. + const bool is_monotonic = (T(left.get()) >= 0) == (T(right.get()) >= 0); + + return { .is_monotonic = is_monotonic }; + } + } + + UNREACHABLE(); + } +}; + +struct ToDateMonotonicity +{ + static bool has() { return true; } + + static IFunction::Monotonicity get(const IDataType & type, const Field & left, const Field & right) + { + auto which = WhichDataType(type); + if (which.isDateOrDate32() || which.isDateTime() || which.isDateTime64() || which.isInt8() || which.isInt16() || which.isUInt8() + || which.isUInt16()) + { + return {.is_monotonic = true, .is_always_monotonic = true}; + } + else if ( + ((left.getType() == Field::Types::UInt64 || left.isNull()) && (right.getType() == Field::Types::UInt64 || right.isNull()) + && ((left.isNull() || left.get() < 0xFFFF) && (right.isNull() || right.get() >= 0xFFFF))) + || ((left.getType() == Field::Types::Int64 || left.isNull()) && (right.getType() == Field::Types::Int64 || right.isNull()) + && ((left.isNull() || left.get() < 0xFFFF) && (right.isNull() || right.get() >= 0xFFFF))) + || (( + (left.getType() == Field::Types::Float64 || left.isNull()) + && (right.getType() == Field::Types::Float64 || right.isNull()) + && ((left.isNull() || left.get() < 0xFFFF) && (right.isNull() || right.get() >= 0xFFFF)))) + || !isNativeNumber(type)) + { + return {}; + } + else + { + return {.is_monotonic = true, .is_always_monotonic = true}; + } + } +}; + +struct ToDateTimeMonotonicity +{ + static bool has() { return true; } + + static IFunction::Monotonicity get(const IDataType & type, const Field &, const Field &) + { + if (type.isValueRepresentedByNumber()) + return {.is_monotonic = true, .is_always_monotonic = true}; + else + return {}; + } +}; + +/** The monotonicity for the `toString` function is mainly determined for test purposes. + * It is doubtful that anyone is looking to optimize queries with conditions `toString(CounterID) = 34`. + */ +struct ToStringMonotonicity +{ + static bool has() { return true; } + + static IFunction::Monotonicity get(const IDataType & type, const Field & left, const Field & right) + { + IFunction::Monotonicity positive{ .is_monotonic = true }; + IFunction::Monotonicity not_monotonic; + + const auto * type_ptr = &type; + if (const auto * low_cardinality_type = checkAndGetDataType(type_ptr)) + type_ptr = low_cardinality_type->getDictionaryType().get(); + + /// Order on enum values (which is the order on integers) is completely arbitrary in respect to the order on strings. 
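+        /// (Hypothetical example: for Enum8('b' = 1, 'a' = 2), the source order is
+        /// 'b' < 'a' by the underlying integers, but after toString the string order is 'a' < 'b'.)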
+        if (WhichDataType(type).isEnum())
+            return not_monotonic;
+
+        /// `toString` function is monotonic if the argument is Date or Date32 or DateTime or String, or non-negative numbers with the same number of digits.
+        if (checkDataTypes<DataTypeDate, DataTypeDate32, DataTypeDateTime, DataTypeString>(type_ptr))
+            return positive;
+
+        if (left.isNull() || right.isNull())
+            return {};
+
+        if (left.getType() == Field::Types::UInt64
+            && right.getType() == Field::Types::UInt64)
+        {
+            return (left.get<UInt64>() == 0 && right.get<UInt64>() == 0)
+                || (floor(log10(left.get<UInt64>())) == floor(log10(right.get<UInt64>())))
+                ? positive : not_monotonic;
+        }
+
+        if (left.getType() == Field::Types::Int64
+            && right.getType() == Field::Types::Int64)
+        {
+            return (left.get<Int64>() == 0 && right.get<Int64>() == 0)
+                || (left.get<Int64>() > 0 && right.get<Int64>() > 0 && floor(log10(left.get<Int64>())) == floor(log10(right.get<Int64>())))
+                ? positive : not_monotonic;
+        }
+
+        return not_monotonic;
+    }
+};
+
+
+struct NameToUInt8 { static constexpr auto name = "toUInt8"; };
+struct NameToUInt16 { static constexpr auto name = "toUInt16"; };
+struct NameToUInt32 { static constexpr auto name = "toUInt32"; };
+struct NameToUInt64 { static constexpr auto name = "toUInt64"; };
+struct NameToUInt128 { static constexpr auto name = "toUInt128"; };
+struct NameToUInt256 { static constexpr auto name = "toUInt256"; };
+struct NameToInt8 { static constexpr auto name = "toInt8"; };
+struct NameToInt16 { static constexpr auto name = "toInt16"; };
+struct NameToInt32 { static constexpr auto name = "toInt32"; };
+struct NameToInt64 { static constexpr auto name = "toInt64"; };
+struct NameToInt128 { static constexpr auto name = "toInt128"; };
+struct NameToInt256 { static constexpr auto name = "toInt256"; };
+struct NameToFloat32 { static constexpr auto name = "toFloat32"; };
+struct NameToFloat64 { static constexpr auto name = "toFloat64"; };
+struct NameToUUID { static constexpr auto name = "toUUID"; };
+struct NameToIPv4 { static constexpr auto name = "toIPv4"; };
+struct NameToIPv6 { static constexpr auto name = "toIPv6"; };
+
+using FunctionToUInt8 = FunctionConvert<DataTypeUInt8, NameToUInt8, ToNumberMonotonicity<UInt8>>;
+using FunctionToUInt16 = FunctionConvert<DataTypeUInt16, NameToUInt16, ToNumberMonotonicity<UInt16>>;
+using FunctionToUInt32 = FunctionConvert<DataTypeUInt32, NameToUInt32, ToNumberMonotonicity<UInt32>>;
+using FunctionToUInt64 = FunctionConvert<DataTypeUInt64, NameToUInt64, ToNumberMonotonicity<UInt64>>;
+using FunctionToUInt128 = FunctionConvert<DataTypeUInt128, NameToUInt128, ToNumberMonotonicity<UInt128>>;
+using FunctionToUInt256 = FunctionConvert<DataTypeUInt256, NameToUInt256, ToNumberMonotonicity<UInt256>>;
+using FunctionToInt8 = FunctionConvert<DataTypeInt8, NameToInt8, ToNumberMonotonicity<Int8>>;
+using FunctionToInt16 = FunctionConvert<DataTypeInt16, NameToInt16, ToNumberMonotonicity<Int16>>;
+using FunctionToInt32 = FunctionConvert<DataTypeInt32, NameToInt32, ToNumberMonotonicity<Int32>>;
+using FunctionToInt64 = FunctionConvert<DataTypeInt64, NameToInt64, ToNumberMonotonicity<Int64>>;
+using FunctionToInt128 = FunctionConvert<DataTypeInt128, NameToInt128, ToNumberMonotonicity<Int128>>;
+using FunctionToInt256 = FunctionConvert<DataTypeInt256, NameToInt256, ToNumberMonotonicity<Int256>>;
+using FunctionToFloat32 = FunctionConvert<DataTypeFloat32, NameToFloat32, ToNumberMonotonicity<Float32>>;
+using FunctionToFloat64 = FunctionConvert<DataTypeFloat64, NameToFloat64, ToNumberMonotonicity<Float64>>;
+
+using FunctionToDate = FunctionConvert<DataTypeDate, NameToDate, ToDateMonotonicity>;
+
+using FunctionToDate32 = FunctionConvert<DataTypeDate32, NameToDate32, ToDateMonotonicity>;
+
+using FunctionToDateTime = FunctionConvert<DataTypeDateTime, NameToDateTime, ToDateTimeMonotonicity>;
+
+using FunctionToDateTime32 = FunctionConvert<DataTypeDateTime, NameToDateTime32, ToDateTimeMonotonicity>;
+
+using FunctionToDateTime64 = FunctionConvert<DataTypeDateTime64, NameToDateTime64, ToDateTimeMonotonicity>;
+
+using FunctionToUUID = FunctionConvert<DataTypeUUID, NameToUUID, ToNumberMonotonicity<UInt128>>;
+using FunctionToIPv4 = FunctionConvert<DataTypeIPv4, NameToIPv4, ToNumberMonotonicity<UInt32>>;
+using FunctionToIPv6 = FunctionConvert<DataTypeIPv6, NameToIPv6, ToNumberMonotonicity<UInt128>>;
+using FunctionToString = FunctionConvert<DataTypeString, NameToString, ToStringMonotonicity>;
+using FunctionToUnixTimestamp = FunctionConvert<DataTypeUInt32, NameToUnixTimestamp, ToNumberMonotonicity<UInt32>>;
+using FunctionToDecimal32 = FunctionConvert<DataTypeDecimal<Decimal32>, NameToDecimal32, UnknownMonotonicity>;
+using FunctionToDecimal64 = FunctionConvert<DataTypeDecimal<Decimal64>, NameToDecimal64, UnknownMonotonicity>;
+using FunctionToDecimal128 = FunctionConvert<DataTypeDecimal<Decimal128>, NameToDecimal128, UnknownMonotonicity>;
+using FunctionToDecimal256 = FunctionConvert<DataTypeDecimal<Decimal256>, NameToDecimal256, UnknownMonotonicity>;
+
+template <typename DataType, FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior = default_date_time_overflow_behavior> struct FunctionTo;
+
+template <> struct FunctionTo<DataTypeUInt8> { using Type = FunctionToUInt8; };
+template <> struct FunctionTo<DataTypeUInt16> { using Type = 
FunctionToUInt16; };
+template <> struct FunctionTo<DataTypeUInt32> { using Type = FunctionToUInt32; };
+template <> struct FunctionTo<DataTypeUInt64> { using Type = FunctionToUInt64; };
+template <> struct FunctionTo<DataTypeUInt128> { using Type = FunctionToUInt128; };
+template <> struct FunctionTo<DataTypeUInt256> { using Type = FunctionToUInt256; };
+template <> struct FunctionTo<DataTypeInt8> { using Type = FunctionToInt8; };
+template <> struct FunctionTo<DataTypeInt16> { using Type = FunctionToInt16; };
+template <> struct FunctionTo<DataTypeInt32> { using Type = FunctionToInt32; };
+template <> struct FunctionTo<DataTypeInt64> { using Type = FunctionToInt64; };
+template <> struct FunctionTo<DataTypeInt128> { using Type = FunctionToInt128; };
+template <> struct FunctionTo<DataTypeInt256> { using Type = FunctionToInt256; };
+template <> struct FunctionTo<DataTypeFloat32> { using Type = FunctionToFloat32; };
+template <> struct FunctionTo<DataTypeFloat64> { using Type = FunctionToFloat64; };
+
+template <FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior>
+struct FunctionTo<DataTypeDate, date_time_overflow_behavior> { using Type = FunctionToDate; };
+
+template <FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior>
+struct FunctionTo<DataTypeDate32, date_time_overflow_behavior> { using Type = FunctionToDate32; };
+
+template <FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior>
+struct FunctionTo<DataTypeDateTime, date_time_overflow_behavior> { using Type = FunctionToDateTime; };
+
+template <FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior>
+struct FunctionTo<DataTypeDateTime64, date_time_overflow_behavior> { using Type = FunctionToDateTime64; };
+
+template <> struct FunctionTo<DataTypeUUID> { using Type = FunctionToUUID; };
+template <> struct FunctionTo<DataTypeIPv4> { using Type = FunctionToIPv4; };
+template <> struct FunctionTo<DataTypeIPv6> { using Type = FunctionToIPv6; };
+template <> struct FunctionTo<DataTypeString> { using Type = FunctionToString; };
+template <> struct FunctionTo<DataTypeFixedString> { using Type = FunctionToFixedString; };
+template <> struct FunctionTo<DataTypeDecimal<Decimal32>> { using Type = FunctionToDecimal32; };
+template <> struct FunctionTo<DataTypeDecimal<Decimal64>> { using Type = FunctionToDecimal64; };
+template <> struct FunctionTo<DataTypeDecimal<Decimal128>> { using Type = FunctionToDecimal128; };
+template <> struct FunctionTo<DataTypeDecimal<Decimal256>> { using Type = FunctionToDecimal256; };
+
+template <typename FieldType> struct FunctionTo<DataTypeEnum<FieldType>>
+    : FunctionTo<DataTypeNumber<FieldType>>
+{
+};
+
+struct NameToUInt8OrZero { static constexpr auto name = "toUInt8OrZero"; };
+struct NameToUInt16OrZero { static constexpr auto name = "toUInt16OrZero"; };
+struct NameToUInt32OrZero { static constexpr auto name = "toUInt32OrZero"; };
+struct NameToUInt64OrZero { static constexpr auto name = "toUInt64OrZero"; };
+struct NameToUInt128OrZero { static constexpr auto name = "toUInt128OrZero"; };
+struct NameToUInt256OrZero { static constexpr auto name = "toUInt256OrZero"; };
+struct NameToInt8OrZero { static constexpr auto name = "toInt8OrZero"; };
+struct NameToInt16OrZero { static constexpr auto name = "toInt16OrZero"; };
+struct NameToInt32OrZero { static constexpr auto name = "toInt32OrZero"; };
+struct NameToInt64OrZero { static constexpr auto name = "toInt64OrZero"; };
+struct NameToInt128OrZero { static constexpr auto name = "toInt128OrZero"; };
+struct NameToInt256OrZero { static constexpr auto name = "toInt256OrZero"; };
+struct NameToFloat32OrZero { static constexpr auto name = "toFloat32OrZero"; };
+struct NameToFloat64OrZero { static constexpr auto name = "toFloat64OrZero"; };
+struct NameToDateOrZero { static constexpr auto name = "toDateOrZero"; };
+struct NameToDate32OrZero { static constexpr auto name = "toDate32OrZero"; };
+struct NameToDateTimeOrZero { static constexpr auto name = "toDateTimeOrZero"; };
+struct NameToDateTime64OrZero { static constexpr auto name = "toDateTime64OrZero"; };
+struct NameToDecimal32OrZero { static constexpr auto name = "toDecimal32OrZero"; };
+struct NameToDecimal64OrZero { static constexpr auto name = "toDecimal64OrZero"; };
+struct NameToDecimal128OrZero { static constexpr auto name = "toDecimal128OrZero"; };
+struct NameToDecimal256OrZero { static constexpr auto name = "toDecimal256OrZero"; };
+struct 
NameToUUIDOrZero { static constexpr auto name = "toUUIDOrZero"; };
+struct NameToIPv4OrZero { static constexpr auto name = "toIPv4OrZero"; };
+struct NameToIPv6OrZero { static constexpr auto name = "toIPv6OrZero"; };
+
+using FunctionToUInt8OrZero = FunctionConvertFromString<DataTypeUInt8, NameToUInt8OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToUInt16OrZero = FunctionConvertFromString<DataTypeUInt16, NameToUInt16OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToUInt32OrZero = FunctionConvertFromString<DataTypeUInt32, NameToUInt32OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToUInt64OrZero = FunctionConvertFromString<DataTypeUInt64, NameToUInt64OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToUInt128OrZero = FunctionConvertFromString<DataTypeUInt128, NameToUInt128OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToUInt256OrZero = FunctionConvertFromString<DataTypeUInt256, NameToUInt256OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToInt8OrZero = FunctionConvertFromString<DataTypeInt8, NameToInt8OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToInt16OrZero = FunctionConvertFromString<DataTypeInt16, NameToInt16OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToInt32OrZero = FunctionConvertFromString<DataTypeInt32, NameToInt32OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToInt64OrZero = FunctionConvertFromString<DataTypeInt64, NameToInt64OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToInt128OrZero = FunctionConvertFromString<DataTypeInt128, NameToInt128OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToInt256OrZero = FunctionConvertFromString<DataTypeInt256, NameToInt256OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToFloat32OrZero = FunctionConvertFromString<DataTypeFloat32, NameToFloat32OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToFloat64OrZero = FunctionConvertFromString<DataTypeFloat64, NameToFloat64OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToDateOrZero = FunctionConvertFromString<DataTypeDate, NameToDateOrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToDate32OrZero = FunctionConvertFromString<DataTypeDate32, NameToDate32OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToDateTimeOrZero = FunctionConvertFromString<DataTypeDateTime, NameToDateTimeOrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToDateTime64OrZero = FunctionConvertFromString<DataTypeDateTime64, NameToDateTime64OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToDecimal32OrZero = FunctionConvertFromString<DataTypeDecimal<Decimal32>, NameToDecimal32OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToDecimal64OrZero = FunctionConvertFromString<DataTypeDecimal<Decimal64>, NameToDecimal64OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToDecimal128OrZero = FunctionConvertFromString<DataTypeDecimal<Decimal128>, NameToDecimal128OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToDecimal256OrZero = FunctionConvertFromString<DataTypeDecimal<Decimal256>, NameToDecimal256OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToUUIDOrZero = FunctionConvertFromString<DataTypeUUID, NameToUUIDOrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToIPv4OrZero = FunctionConvertFromString<DataTypeIPv4, NameToIPv4OrZero, ConvertFromStringExceptionMode::Zero>;
+using FunctionToIPv6OrZero = FunctionConvertFromString<DataTypeIPv6, NameToIPv6OrZero, ConvertFromStringExceptionMode::Zero>;
+
+struct NameToUInt8OrNull { static constexpr auto name = "toUInt8OrNull"; };
+struct NameToUInt16OrNull { static constexpr auto name = "toUInt16OrNull"; };
+struct NameToUInt32OrNull { static constexpr auto name = "toUInt32OrNull"; };
+struct NameToUInt64OrNull { static constexpr auto name = "toUInt64OrNull"; };
+struct NameToUInt128OrNull { static constexpr auto name = "toUInt128OrNull"; };
+struct NameToUInt256OrNull { static constexpr auto name = "toUInt256OrNull"; };
+struct NameToInt8OrNull { static constexpr auto name = "toInt8OrNull"; };
+struct NameToInt16OrNull { static constexpr auto name = "toInt16OrNull"; };
+struct NameToInt32OrNull { static constexpr auto name = "toInt32OrNull"; };
+struct NameToInt64OrNull { static constexpr auto name = "toInt64OrNull"; };
+struct NameToInt128OrNull { static constexpr auto name = "toInt128OrNull"; };
+struct NameToInt256OrNull { static constexpr auto name = "toInt256OrNull"; };
+struct NameToFloat32OrNull { static constexpr auto name = "toFloat32OrNull"; };
+struct NameToFloat64OrNull { static constexpr auto name = "toFloat64OrNull"; };
+struct NameToDateOrNull { static constexpr auto name = "toDateOrNull"; };
+struct NameToDate32OrNull { static constexpr auto name = "toDate32OrNull"; };
+struct NameToDateTimeOrNull { static constexpr auto name = "toDateTimeOrNull"; };
+struct NameToDateTime64OrNull { static constexpr auto name = "toDateTime64OrNull"; };
+struct NameToDecimal32OrNull { static constexpr auto name = "toDecimal32OrNull"; };
+struct NameToDecimal64OrNull { static constexpr auto name = "toDecimal64OrNull"; };
+struct NameToDecimal128OrNull { static constexpr auto name = "toDecimal128OrNull"; };
+struct NameToDecimal256OrNull { static constexpr auto name = "toDecimal256OrNull"; };
+struct NameToUUIDOrNull { static constexpr auto name = "toUUIDOrNull"; };
+struct NameToIPv4OrNull { static constexpr auto name = "toIPv4OrNull"; };
+struct NameToIPv6OrNull { static constexpr auto name = "toIPv6OrNull"; };
+
+using FunctionToUInt8OrNull = FunctionConvertFromString<DataTypeUInt8, NameToUInt8OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToUInt16OrNull = FunctionConvertFromString<DataTypeUInt16, NameToUInt16OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToUInt32OrNull = FunctionConvertFromString<DataTypeUInt32, NameToUInt32OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToUInt64OrNull = FunctionConvertFromString<DataTypeUInt64, NameToUInt64OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToUInt128OrNull = FunctionConvertFromString<DataTypeUInt128, NameToUInt128OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToUInt256OrNull = FunctionConvertFromString<DataTypeUInt256, NameToUInt256OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToInt8OrNull = FunctionConvertFromString<DataTypeInt8, NameToInt8OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToInt16OrNull = FunctionConvertFromString<DataTypeInt16, NameToInt16OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToInt32OrNull = FunctionConvertFromString<DataTypeInt32, NameToInt32OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToInt64OrNull = FunctionConvertFromString<DataTypeInt64, NameToInt64OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToInt128OrNull = FunctionConvertFromString<DataTypeInt128, NameToInt128OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToInt256OrNull = FunctionConvertFromString<DataTypeInt256, NameToInt256OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToFloat32OrNull = FunctionConvertFromString<DataTypeFloat32, NameToFloat32OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToFloat64OrNull = FunctionConvertFromString<DataTypeFloat64, NameToFloat64OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToDateOrNull = FunctionConvertFromString<DataTypeDate, NameToDateOrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToDate32OrNull = FunctionConvertFromString<DataTypeDate32, NameToDate32OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToDateTimeOrNull = FunctionConvertFromString<DataTypeDateTime, NameToDateTimeOrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToDateTime64OrNull = FunctionConvertFromString<DataTypeDateTime64, NameToDateTime64OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToDecimal32OrNull = FunctionConvertFromString<DataTypeDecimal<Decimal32>, NameToDecimal32OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToDecimal64OrNull = FunctionConvertFromString<DataTypeDecimal<Decimal64>, NameToDecimal64OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToDecimal128OrNull = FunctionConvertFromString<DataTypeDecimal<Decimal128>, NameToDecimal128OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToDecimal256OrNull = FunctionConvertFromString<DataTypeDecimal<Decimal256>, NameToDecimal256OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToUUIDOrNull = FunctionConvertFromString<DataTypeUUID, NameToUUIDOrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToIPv4OrNull = FunctionConvertFromString<DataTypeIPv4, NameToIPv4OrNull, ConvertFromStringExceptionMode::Null>;
+using FunctionToIPv6OrNull = FunctionConvertFromString<DataTypeIPv6, NameToIPv6OrNull, ConvertFromStringExceptionMode::Null>;
+
+struct NameParseDateTimeBestEffort { static constexpr auto name = "parseDateTimeBestEffort"; };
+struct NameParseDateTimeBestEffortOrZero { static constexpr auto name = "parseDateTimeBestEffortOrZero"; };
+struct NameParseDateTimeBestEffortOrNull { static constexpr auto name = "parseDateTimeBestEffortOrNull"; };
+struct NameParseDateTimeBestEffortUS { static constexpr auto name = "parseDateTimeBestEffortUS"; };
+struct NameParseDateTimeBestEffortUSOrZero { static constexpr auto name = "parseDateTimeBestEffortUSOrZero"; };
+struct NameParseDateTimeBestEffortUSOrNull { static constexpr auto name = "parseDateTimeBestEffortUSOrNull"; };
+struct NameParseDateTime32BestEffort { static constexpr auto name = "parseDateTime32BestEffort"; };
+struct NameParseDateTime32BestEffortOrZero { static constexpr auto name = "parseDateTime32BestEffortOrZero"; };
+struct NameParseDateTime32BestEffortOrNull { static constexpr auto name = "parseDateTime32BestEffortOrNull"; };
+struct NameParseDateTime64BestEffort { static constexpr auto name = "parseDateTime64BestEffort"; };
+struct NameParseDateTime64BestEffortOrZero { static constexpr auto name = "parseDateTime64BestEffortOrZero"; };
+struct NameParseDateTime64BestEffortOrNull { static constexpr auto name = "parseDateTime64BestEffortOrNull"; };
+struct NameParseDateTime64BestEffortUS { static constexpr auto name = "parseDateTime64BestEffortUS"; };
+struct 
NameParseDateTime64BestEffortUSOrZero { static constexpr auto name = "parseDateTime64BestEffortUSOrZero"; }; +struct NameParseDateTime64BestEffortUSOrNull { static constexpr auto name = "parseDateTime64BestEffortUSOrNull"; }; + + +using FunctionParseDateTimeBestEffort = FunctionConvertFromString< + DataTypeDateTime, NameParseDateTimeBestEffort, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffort>; +using FunctionParseDateTimeBestEffortOrZero = FunctionConvertFromString< + DataTypeDateTime, NameParseDateTimeBestEffortOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffort>; +using FunctionParseDateTimeBestEffortOrNull = FunctionConvertFromString< + DataTypeDateTime, NameParseDateTimeBestEffortOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffort>; + +using FunctionParseDateTimeBestEffortUS = FunctionConvertFromString< + DataTypeDateTime, NameParseDateTimeBestEffortUS, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffortUS>; +using FunctionParseDateTimeBestEffortUSOrZero = FunctionConvertFromString< + DataTypeDateTime, NameParseDateTimeBestEffortUSOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffortUS>; +using FunctionParseDateTimeBestEffortUSOrNull = FunctionConvertFromString< + DataTypeDateTime, NameParseDateTimeBestEffortUSOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffortUS>; + +using FunctionParseDateTime32BestEffort = FunctionConvertFromString< + DataTypeDateTime, NameParseDateTime32BestEffort, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffort>; +using FunctionParseDateTime32BestEffortOrZero = FunctionConvertFromString< + DataTypeDateTime, NameParseDateTime32BestEffortOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffort>; +using FunctionParseDateTime32BestEffortOrNull = FunctionConvertFromString< + DataTypeDateTime, NameParseDateTime32BestEffortOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffort>; + +using FunctionParseDateTime64BestEffort = FunctionConvertFromString< + DataTypeDateTime64, NameParseDateTime64BestEffort, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffort>; +using FunctionParseDateTime64BestEffortOrZero = FunctionConvertFromString< + DataTypeDateTime64, NameParseDateTime64BestEffortOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffort>; +using FunctionParseDateTime64BestEffortOrNull = FunctionConvertFromString< + DataTypeDateTime64, NameParseDateTime64BestEffortOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffort>; + +using FunctionParseDateTime64BestEffortUS = FunctionConvertFromString< + DataTypeDateTime64, NameParseDateTime64BestEffortUS, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffortUS>; +using FunctionParseDateTime64BestEffortUSOrZero = FunctionConvertFromString< + DataTypeDateTime64, NameParseDateTime64BestEffortUSOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffortUS>; +using FunctionParseDateTime64BestEffortUSOrNull = FunctionConvertFromString< + DataTypeDateTime64, NameParseDateTime64BestEffortUSOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffortUS>; + + +class ExecutableFunctionCast : public IExecutableFunction +{ +public: + using WrapperType = std::function; + + explicit 
ExecutableFunctionCast( + WrapperType && wrapper_function_, const char * name_, std::optional diagnostic_) + : wrapper_function(std::move(wrapper_function_)), name(name_), diagnostic(std::move(diagnostic_)) {} + + String getName() const override { return name; } + +protected: + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + { + /// drop second argument, pass others + ColumnsWithTypeAndName new_arguments{arguments.front()}; + if (arguments.size() > 2) + new_arguments.insert(std::end(new_arguments), std::next(std::begin(arguments), 2), std::end(arguments)); + + try + { + return wrapper_function(new_arguments, result_type, nullptr, input_rows_count); + } + catch (Exception & e) + { + if (diagnostic) + e.addMessage("while converting source column " + backQuoteIfNeed(diagnostic->column_from) + + " to destination column " + backQuoteIfNeed(diagnostic->column_to)); + throw; + } + } + + bool useDefaultImplementationForNulls() const override { return false; } + /// CAST(Nothing, T) -> T + bool useDefaultImplementationForNothing() const override { return false; } + bool useDefaultImplementationForConstants() const override { return true; } + bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } + +private: + WrapperType wrapper_function; + const char * name; + std::optional diagnostic; +}; + + +struct FunctionCastName +{ + static constexpr auto name = "CAST"; +}; + +class FunctionCastBase : public IFunctionBase +{ +public: + using MonotonicityForRange = std::function; +}; + +class FunctionCast final : public FunctionCastBase +{ +public: + using WrapperType = std::function; + + FunctionCast(ContextPtr context_ + , const char * cast_name_ + , MonotonicityForRange && monotonicity_for_range_ + , const DataTypes & argument_types_ + , const DataTypePtr & return_type_ + , std::optional diagnostic_ + , CastType cast_type_) + : cast_name(cast_name_), monotonicity_for_range(std::move(monotonicity_for_range_)) + , argument_types(argument_types_), return_type(return_type_), diagnostic(std::move(diagnostic_)) + , cast_type(cast_type_) + , context(context_) + { + } + + const DataTypes & getArgumentTypes() const override { return argument_types; } + const DataTypePtr & getResultType() const override { return return_type; } + + ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName & /*sample_columns*/) const override + { + try + { + return std::make_unique( + prepareUnpackDictionaries(getArgumentTypes()[0], getResultType()), cast_name, diagnostic); + } + catch (Exception & e) + { + if (diagnostic) + e.addMessage("while converting source column " + backQuoteIfNeed(diagnostic->column_from) + + " to destination column " + backQuoteIfNeed(diagnostic->column_to)); + throw; + } + } + + String getName() const override { return cast_name; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + bool hasInformationAboutMonotonicity() const override + { + return static_cast(monotonicity_for_range); + } + + Monotonicity getMonotonicityForRange(const IDataType & type, const Field & left, const Field & right) const override + { + return monotonicity_for_range(type, left, right); + } + +private: + + const char * cast_name; + MonotonicityForRange monotonicity_for_range; + + DataTypes argument_types; + DataTypePtr return_type; + + 
std::optional diagnostic;
+    CastType cast_type;
+    ContextPtr context;
+
+    static WrapperType createFunctionAdaptor(FunctionPtr function, const DataTypePtr & from_type)
+    {
+        auto function_adaptor = std::make_unique<FunctionToOverloadResolverAdaptor>(function)->build({ColumnWithTypeAndName{nullptr, from_type, ""}});
+
+        return [function_adaptor]
+            (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count)
+        {
+            return function_adaptor->execute(arguments, result_type, input_rows_count);
+        };
+    }
+
+    static WrapperType createToNullableColumnWrapper()
+    {
+        return [] (ColumnsWithTypeAndName &, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count)
+        {
+            ColumnPtr res = result_type->createColumn();
+            ColumnUInt8::Ptr col_null_map_to = ColumnUInt8::create(input_rows_count, true);
+            return ColumnNullable::create(res->cloneResized(input_rows_count), std::move(col_null_map_to));
+        };
+    }
+
+    template <typename ToDataType>
+    WrapperType createWrapper(const DataTypePtr & from_type, const ToDataType * const to_type, bool requested_result_is_nullable) const
+    {
+        TypeIndex from_type_index = from_type->getTypeId();
+        WhichDataType which(from_type_index);
+        bool can_apply_accurate_cast = (cast_type == CastType::accurate || cast_type == CastType::accurateOrNull)
+            && (which.isInt() || which.isUInt() || which.isFloat());
+
+        FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior = default_date_time_overflow_behavior;
+        if (context)
+            date_time_overflow_behavior = context->getSettingsRef().date_time_overflow_behavior;
+
+        if (requested_result_is_nullable && checkAndGetDataType<DataTypeString>(from_type.get()))
+        {
+            /// When converting to a Nullable type, we apply a different parsing rule:
+            /// instead of throwing an exception, malformed input yields NULL.
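+            /// (Illustrative, not part of this patch: CAST('abc' AS Nullable(UInt8))
+            /// is expected to yield NULL rather than throw, mirroring toUInt8OrNull('abc').)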
+ FunctionPtr function = FunctionConvertFromString::create(); + return createFunctionAdaptor(function, from_type); + } + else if (!can_apply_accurate_cast) + { + FunctionPtr function = FunctionTo::Type::create(context); + return createFunctionAdaptor(function, from_type); + } + + auto wrapper_cast_type = cast_type; + + return [wrapper_cast_type, from_type_index, to_type, date_time_overflow_behavior] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *column_nullable, size_t input_rows_count) + { + ColumnPtr result_column; + auto res = callOnIndexAndDataType(from_type_index, [&](const auto & types) -> bool { + using Types = std::decay_t; + using LeftDataType = typename Types::LeftType; + using RightDataType = typename Types::RightType; + + if constexpr (IsDataTypeNumber) + { + if constexpr (IsDataTypeNumber) + { +#define GENERATE_OVERFLOW_MODE_CASE(OVERFLOW_MODE, ADDITIONS) \ + case FormatSettings::DateTimeOverflowBehavior::OVERFLOW_MODE: \ + result_column = ConvertImpl::execute( \ + arguments, result_type, input_rows_count, ADDITIONS()); \ + break; + if (wrapper_cast_type == CastType::accurate) + { + switch (date_time_overflow_behavior) + { + GENERATE_OVERFLOW_MODE_CASE(Throw, AccurateConvertStrategyAdditions) + GENERATE_OVERFLOW_MODE_CASE(Ignore, AccurateConvertStrategyAdditions) + GENERATE_OVERFLOW_MODE_CASE(Saturate, AccurateConvertStrategyAdditions) + } + } + else + { + switch (date_time_overflow_behavior) + { + GENERATE_OVERFLOW_MODE_CASE(Throw, AccurateOrNullConvertStrategyAdditions) + GENERATE_OVERFLOW_MODE_CASE(Ignore, AccurateOrNullConvertStrategyAdditions) + GENERATE_OVERFLOW_MODE_CASE(Saturate, AccurateOrNullConvertStrategyAdditions) + } + } +#undef GENERATE_OVERFLOW_MODE_CASE + + return true; + } + + if constexpr (std::is_same_v || std::is_same_v) + { +#define GENERATE_OVERFLOW_MODE_CASE(OVERFLOW_MODE, ADDITIONS) \ + case FormatSettings::DateTimeOverflowBehavior::OVERFLOW_MODE: \ + result_column = ConvertImpl::template execute( \ +arguments, result_type, input_rows_count); \ + break; + if (wrapper_cast_type == CastType::accurate) + { + switch (date_time_overflow_behavior) + { + GENERATE_OVERFLOW_MODE_CASE(Throw, DateTimeAccurateConvertStrategyAdditions) + GENERATE_OVERFLOW_MODE_CASE(Ignore, DateTimeAccurateConvertStrategyAdditions) + GENERATE_OVERFLOW_MODE_CASE(Saturate, DateTimeAccurateConvertStrategyAdditions) + } + } + else + { + switch (date_time_overflow_behavior) + { + GENERATE_OVERFLOW_MODE_CASE(Throw, DateTimeAccurateOrNullConvertStrategyAdditions) + GENERATE_OVERFLOW_MODE_CASE(Ignore, DateTimeAccurateOrNullConvertStrategyAdditions) + GENERATE_OVERFLOW_MODE_CASE(Saturate, DateTimeAccurateOrNullConvertStrategyAdditions) + } + } +#undef GENERATE_OVERFLOW_MODE_CASE + return true; + } + } + + return false; + }); + + /// Additionally check if callOnIndexAndDataType wasn't called at all. 
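+            /// (callOnIndexAndDataType returns false when no branch matched the source
+            /// TypeIndex; for accurateCastOrNull we then degrade to an all-NULL column
+            /// instead of throwing.)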
+ if (!res) + { + if (wrapper_cast_type == CastType::accurateOrNull) + { + auto nullable_column_wrapper = FunctionCast::createToNullableColumnWrapper(); + return nullable_column_wrapper(arguments, result_type, column_nullable, input_rows_count); + } + else + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, + "Conversion from {} to {} is not supported", + from_type_index, to_type->getName()); + } + } + + return result_column; + }; + } + + template + WrapperType createBoolWrapper(const DataTypePtr & from_type, const ToDataType * const to_type, bool requested_result_is_nullable) const + { + if (checkAndGetDataType(from_type.get())) + { + return &ConvertImplGenericFromString::execute; + } + + return createWrapper(from_type, to_type, requested_result_is_nullable); + } + + WrapperType createUInt8ToBoolWrapper(const DataTypePtr from_type, const DataTypePtr to_type) const + { + return [from_type, to_type] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t /*input_rows_count*/) -> ColumnPtr + { + /// Special case when we convert UInt8 column to Bool column. + /// both columns have type UInt8, but we shouldn't use identity wrapper, + /// because Bool column can contain only 0 and 1. + auto res_column = to_type->createColumn(); + const auto & data_from = checkAndGetColumn(arguments[0].column.get())->getData(); + auto & data_to = assert_cast(res_column.get())->getData(); + data_to.resize(data_from.size()); + for (size_t i = 0; i != data_from.size(); ++i) + data_to[i] = static_cast(data_from[i]); + return res_column; + }; + } + + static WrapperType createStringWrapper(const DataTypePtr & from_type) + { + FunctionPtr function = FunctionToString::create(); + return createFunctionAdaptor(function, from_type); + } + + WrapperType createFixedStringWrapper(const DataTypePtr & from_type, const size_t N) const + { + if (!isStringOrFixedString(from_type)) + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "CAST AS FixedString is only implemented for types String and FixedString"); + + bool exception_mode_null = cast_type == CastType::accurateOrNull; + return [exception_mode_null, N] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t /*input_rows_count*/) + { + if (exception_mode_null) + return FunctionToFixedString::executeForN(arguments, N); + else + return FunctionToFixedString::executeForN(arguments, N); + }; + } + +#define GENERATE_INTERVAL_CASE(INTERVAL_KIND) \ + case IntervalKind::Kind::INTERVAL_KIND: \ + return createFunctionAdaptor(FunctionConvert::create(), from_type); + + static WrapperType createIntervalWrapper(const DataTypePtr & from_type, IntervalKind kind) + { + switch (kind) + { + GENERATE_INTERVAL_CASE(Nanosecond) + GENERATE_INTERVAL_CASE(Microsecond) + GENERATE_INTERVAL_CASE(Millisecond) + GENERATE_INTERVAL_CASE(Second) + GENERATE_INTERVAL_CASE(Minute) + GENERATE_INTERVAL_CASE(Hour) + GENERATE_INTERVAL_CASE(Day) + GENERATE_INTERVAL_CASE(Week) + GENERATE_INTERVAL_CASE(Month) + GENERATE_INTERVAL_CASE(Quarter) + GENERATE_INTERVAL_CASE(Year) + } + throw Exception{ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion to unexpected IntervalKind: {}", kind.toString()}; + } + +#undef GENERATE_INTERVAL_CASE + + template + requires IsDataTypeDecimal + WrapperType createDecimalWrapper(const DataTypePtr & from_type, const ToDataType * to_type, bool requested_result_is_nullable) const + { + TypeIndex type_index = from_type->getTypeId(); + UInt32 scale = to_type->getScale(); + + WhichDataType which(type_index); + bool ok = 
which.isNativeInt() || which.isNativeUInt() || which.isDecimal() || which.isFloat() || which.isDateOrDate32() || which.isDateTime() || which.isDateTime64() + || which.isStringOrFixedString(); + if (!ok) + { + if (cast_type == CastType::accurateOrNull) + return createToNullableColumnWrapper(); + else + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion from {} to {} is not supported", + from_type->getName(), to_type->getName()); + } + + auto wrapper_cast_type = cast_type; + + return [wrapper_cast_type, type_index, scale, to_type, requested_result_is_nullable] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *column_nullable, size_t input_rows_count) + { + ColumnPtr result_column; + auto res = callOnIndexAndDataType(type_index, [&](const auto & types) -> bool + { + using Types = std::decay_t; + using LeftDataType = typename Types::LeftType; + using RightDataType = typename Types::RightType; + + if constexpr (IsDataTypeDecimalOrNumber && IsDataTypeDecimalOrNumber && !std::is_same_v) + { + if (wrapper_cast_type == CastType::accurate) + { + AccurateConvertStrategyAdditions additions; + additions.scale = scale; + result_column = ConvertImpl::execute( + arguments, result_type, input_rows_count, additions); + + return true; + } + else if (wrapper_cast_type == CastType::accurateOrNull) + { + AccurateOrNullConvertStrategyAdditions additions; + additions.scale = scale; + result_column = ConvertImpl::execute( + arguments, result_type, input_rows_count, additions); + + return true; + } + } + else if constexpr (std::is_same_v) + { + if (requested_result_is_nullable) + { + /// Consistent with CAST(Nullable(String) AS Nullable(Numbers)) + /// In case when converting to Nullable type, we apply different parsing rule, + /// that will not throw an exception but return NULL in case of malformed input. + result_column = ConvertImpl::execute( + arguments, result_type, input_rows_count, scale); + + return true; + } + } + + result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); + + return true; + }); + + /// Additionally check if callOnIndexAndDataType wasn't called at all. + if (!res) + { + if (wrapper_cast_type == CastType::accurateOrNull) + { + auto nullable_column_wrapper = FunctionCast::createToNullableColumnWrapper(); + return nullable_column_wrapper(arguments, result_type, column_nullable, input_rows_count); + } + else + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, + "Conversion from {} to {} is not supported", + type_index, to_type->getName()); + } + + return result_column; + }; + } + + WrapperType createAggregateFunctionWrapper(const DataTypePtr & from_type_untyped, const DataTypeAggregateFunction * to_type) const + { + /// Conversion from String through parsing. 
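+        /// (Illustrative: a string produced from an aggregate state can be turned back
+        /// with e.g. CAST(s AS AggregateFunction(uniq, UInt64)), assuming s really holds
+        /// the text representation of a matching state.)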
+ if (checkAndGetDataType(from_type_untyped.get())) + { + return &ConvertImplGenericFromString::execute; + } + else if (const auto * agg_type = checkAndGetDataType(from_type_untyped.get())) + { + if (agg_type->getFunction()->haveSameStateRepresentation(*to_type->getFunction())) + { + return [function = to_type->getFunction()]( + ColumnsWithTypeAndName & arguments, + const DataTypePtr & /* result_type */, + const ColumnNullable * /* nullable_source */, + size_t /*input_rows_count*/) -> ColumnPtr + { + const auto & argument_column = arguments.front(); + const auto * col_agg = checkAndGetColumn(argument_column.column.get()); + if (col_agg) + { + auto new_col_agg = ColumnAggregateFunction::create(*col_agg); + new_col_agg->set(function); + return new_col_agg; + } + else + { + throw Exception( + ErrorCodes::LOGICAL_ERROR, + "Illegal column {} for function CAST AS AggregateFunction", + argument_column.column->getName()); + } + }; + } + } + + if (cast_type == CastType::accurateOrNull) + return createToNullableColumnWrapper(); + else + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion from {} to {} is not supported", + from_type_untyped->getName(), to_type->getName()); + } + + WrapperType createArrayWrapper(const DataTypePtr & from_type_untyped, const DataTypeArray & to_type) const + { + /// Conversion from String through parsing. + if (checkAndGetDataType(from_type_untyped.get())) + { + return &ConvertImplGenericFromString::execute; + } + + DataTypePtr from_type_holder; + const auto * from_type = checkAndGetDataType(from_type_untyped.get()); + const auto * from_type_map = checkAndGetDataType(from_type_untyped.get()); + + /// Convert from Map + if (from_type_map) + { + /// Recreate array of unnamed tuples because otherwise it may work + /// unexpectedly while converting to array of named tuples. 
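+            /// (For example, Map(String, UInt64) stores its data as
+            /// Array(Tuple(keys String, values UInt64)); dropping the element names lets
+            /// CAST target arrays of differently named tuples without a name clash.)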
+ from_type_holder = from_type_map->getNestedTypeWithUnnamedTuple(); + from_type = assert_cast(from_type_holder.get()); + } + + if (!from_type) + { + throw Exception(ErrorCodes::TYPE_MISMATCH, + "CAST AS Array can only be performed between same-dimensional Array, Map or String types"); + } + + DataTypePtr from_nested_type = from_type->getNestedType(); + + /// In query SELECT CAST([] AS Array(Array(String))) from type is Array(Nothing) + bool from_empty_array = isNothing(from_nested_type); + + if (from_type->getNumberOfDimensions() != to_type.getNumberOfDimensions() && !from_empty_array) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "CAST AS Array can only be performed between same-dimensional array types"); + + const DataTypePtr & to_nested_type = to_type.getNestedType(); + + /// Prepare nested type conversion + const auto nested_function = prepareUnpackDictionaries(from_nested_type, to_nested_type); + + return [nested_function, from_nested_type, to_nested_type]( + ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t /*input_rows_count*/) -> ColumnPtr + { + const auto & argument_column = arguments.front(); + + const ColumnArray * col_array = nullptr; + + if (const ColumnMap * col_map = checkAndGetColumn(argument_column.column.get())) + col_array = &col_map->getNestedColumn(); + else + col_array = checkAndGetColumn(argument_column.column.get()); + + if (col_array) + { + /// create columns for converting nested column containing original and result columns + ColumnsWithTypeAndName nested_columns{{ col_array->getDataPtr(), from_nested_type, "" }}; + + /// convert nested column + auto result_column = nested_function(nested_columns, to_nested_type, nullable_source, nested_columns.front().column->size()); + + /// set converted nested column to result + return ColumnArray::create(result_column, col_array->getOffsetsPtr()); + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Illegal column {} for function CAST AS Array", + argument_column.column->getName()); + } + }; + } + + using ElementWrappers = std::vector; + + ElementWrappers getElementWrappers(const DataTypes & from_element_types, const DataTypes & to_element_types) const + { + ElementWrappers element_wrappers; + element_wrappers.reserve(from_element_types.size()); + + /// Create conversion wrapper for each element in tuple + for (size_t i = 0; i < from_element_types.size(); ++i) + { + const DataTypePtr & from_element_type = from_element_types[i]; + const DataTypePtr & to_element_type = to_element_types[i]; + element_wrappers.push_back(prepareUnpackDictionaries(from_element_type, to_element_type)); + } + + return element_wrappers; + } + + WrapperType createTupleWrapper(const DataTypePtr & from_type_untyped, const DataTypeTuple * to_type) const + { + /// Conversion from String through parsing. + if (checkAndGetDataType(from_type_untyped.get())) + { + return &ConvertImplGenericFromString::execute; + } + + const auto * from_type = checkAndGetDataType(from_type_untyped.get()); + if (!from_type) + throw Exception(ErrorCodes::TYPE_MISMATCH, "CAST AS Tuple can only be performed between tuple types or from String.\n" + "Left type: {}, right type: {}", from_type_untyped->getName(), to_type->getName()); + + const auto & from_element_types = from_type->getElements(); + const auto & to_element_types = to_type->getElements(); + + std::vector element_wrappers; + std::vector> to_reverse_index; + + /// For named tuples allow conversions for tuples with + /// different sets of elements. 
If element exists in @to_type
+    /// and doesn't exist in @from_type it will be filled by default values.
+    if (from_type->haveExplicitNames() && to_type->haveExplicitNames())
+    {
+        const auto & from_names = from_type->getElementNames();
+        std::unordered_map<String, size_t> from_positions;
+        from_positions.reserve(from_names.size());
+        for (size_t i = 0; i < from_names.size(); ++i)
+            from_positions[from_names[i]] = i;
+
+        const auto & to_names = to_type->getElementNames();
+        element_wrappers.reserve(to_names.size());
+        to_reverse_index.reserve(from_names.size());
+
+        for (size_t i = 0; i < to_names.size(); ++i)
+        {
+            auto it = from_positions.find(to_names[i]);
+            if (it != from_positions.end())
+            {
+                element_wrappers.emplace_back(prepareUnpackDictionaries(from_element_types[it->second], to_element_types[i]));
+                to_reverse_index.emplace_back(it->second);
+            }
+            else
+            {
+                element_wrappers.emplace_back();
+                to_reverse_index.emplace_back();
+            }
+        }
+    }
+    else
+    {
+        if (from_element_types.size() != to_element_types.size())
+            throw Exception(ErrorCodes::TYPE_MISMATCH, "CAST AS Tuple can only be performed between tuple types "
+                "with the same number of elements or from String.\nLeft type: {}, right type: {}",
+                from_type->getName(), to_type->getName());
+
+        element_wrappers = getElementWrappers(from_element_types, to_element_types);
+        to_reverse_index.reserve(to_element_types.size());
+        for (size_t i = 0; i < to_element_types.size(); ++i)
+            to_reverse_index.emplace_back(i);
+    }
+
+    return [element_wrappers, from_element_types, to_element_types, to_reverse_index]
+        (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) -> ColumnPtr
+    {
+        const auto * col = arguments.front().column.get();
+
+        size_t tuple_size = to_element_types.size();
+        const ColumnTuple & column_tuple = typeid_cast<const ColumnTuple &>(*col);
+
+        Columns converted_columns(tuple_size);
+
+        /// invoke conversion for each element
+        for (size_t i = 0; i < tuple_size; ++i)
+        {
+            if (to_reverse_index[i])
+            {
+                size_t from_idx = *to_reverse_index[i];
+                ColumnsWithTypeAndName element = {{column_tuple.getColumns()[from_idx], from_element_types[from_idx], "" }};
+                converted_columns[i] = element_wrappers[i](element, to_element_types[i], nullable_source, input_rows_count);
+            }
+            else
+            {
+                converted_columns[i] = to_element_types[i]->createColumn()->cloneResized(input_rows_count);
+            }
+        }
+
+        return ColumnTuple::create(converted_columns);
+    };
+}
+
+/// The case of: tuple([key1, key2, ..., key_n], [value1, value2, ..., value_n])
+WrapperType createTupleToMapWrapper(const DataTypes & from_kv_types, const DataTypes & to_kv_types) const
+{
+    return [element_wrappers = getElementWrappers(from_kv_types, to_kv_types), from_kv_types, to_kv_types]
+        (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t /*input_rows_count*/) -> ColumnPtr
+    {
+        const auto * col = arguments.front().column.get();
+        const auto & column_tuple = assert_cast<const ColumnTuple &>(*col);
+
+        Columns offsets(2);
+        Columns converted_columns(2);
+        for (size_t i = 0; i < 2; ++i)
+        {
+            const auto & column_array = assert_cast<const ColumnArray &>(column_tuple.getColumn(i));
+            ColumnsWithTypeAndName element = {{column_array.getDataPtr(), from_kv_types[i], ""}};
+            converted_columns[i] = element_wrappers[i](element, to_kv_types[i], nullable_source, (element[0].column)->size());
+            offsets[i] = column_array.getOffsetsPtr();
+        }
+
+        const auto & keys_offsets = assert_cast(*offsets[0]).getData();
+        const auto & values_offsets = 
assert_cast(*offsets[1]).getData(); + if (keys_offsets != values_offsets) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "CAST AS Map can only be performed from tuple of arrays with equal sizes."); + + return ColumnMap::create(converted_columns[0], converted_columns[1], offsets[0]); + }; + } + + WrapperType createMapToMapWrapper(const DataTypes & from_kv_types, const DataTypes & to_kv_types) const + { + return [element_wrappers = getElementWrappers(from_kv_types, to_kv_types), from_kv_types, to_kv_types] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t /*input_rows_count*/) -> ColumnPtr + { + const auto * col = arguments.front().column.get(); + const auto & column_map = typeid_cast(*col); + const auto & nested_data = column_map.getNestedData(); + + Columns converted_columns(2); + for (size_t i = 0; i < 2; ++i) + { + ColumnsWithTypeAndName element = {{nested_data.getColumnPtr(i), from_kv_types[i], ""}}; + converted_columns[i] = element_wrappers[i](element, to_kv_types[i], nullable_source, (element[0].column)->size()); + } + + return ColumnMap::create(converted_columns[0], converted_columns[1], column_map.getNestedColumn().getOffsetsPtr()); + }; + } + + /// The case of: [(key1, value1), (key2, value2), ...] + WrapperType createArrayToMapWrapper(const DataTypes & from_kv_types, const DataTypes & to_kv_types) const + { + return [element_wrappers = getElementWrappers(from_kv_types, to_kv_types), from_kv_types, to_kv_types] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t /*input_rows_count*/) -> ColumnPtr + { + const auto * col = arguments.front().column.get(); + const auto & column_array = typeid_cast(*col); + const auto & nested_data = typeid_cast(column_array.getData()); + + Columns converted_columns(2); + for (size_t i = 0; i < 2; ++i) + { + ColumnsWithTypeAndName element = {{nested_data.getColumnPtr(i), from_kv_types[i], ""}}; + converted_columns[i] = element_wrappers[i](element, to_kv_types[i], nullable_source, (element[0].column)->size()); + } + + return ColumnMap::create(converted_columns[0], converted_columns[1], column_array.getOffsetsPtr()); + }; + } + + + WrapperType createMapWrapper(const DataTypePtr & from_type_untyped, const DataTypeMap * to_type) const + { + if (const auto * from_tuple = checkAndGetDataType(from_type_untyped.get())) + { + if (from_tuple->getElements().size() != 2) + throw Exception( + ErrorCodes::TYPE_MISMATCH, + "CAST AS Map from tuple requires 2 elements. " + "Left type: {}, right type: {}", + from_tuple->getName(), + to_type->getName()); + + DataTypes from_kv_types; + const auto & to_kv_types = to_type->getKeyValueTypes(); + + for (const auto & elem : from_tuple->getElements()) + { + const auto * type_array = checkAndGetDataType(elem.get()); + if (!type_array) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "CAST AS Map can only be performed from tuples of array. Got: {}", from_tuple->getName()); + + from_kv_types.push_back(type_array->getNestedType()); + } + + return createTupleToMapWrapper(from_kv_types, to_kv_types); + } + else if (const auto * from_array = typeid_cast(from_type_untyped.get())) + { + const auto * nested_tuple = typeid_cast(from_array->getNestedType().get()); + if (!nested_tuple || nested_tuple->getElements().size() != 2) + throw Exception( + ErrorCodes::TYPE_MISMATCH, + "CAST AS Map from array requires nested tuple of 2 elements. 
" + "Left type: {}, right type: {}", + from_array->getName(), + to_type->getName()); + + return createArrayToMapWrapper(nested_tuple->getElements(), to_type->getKeyValueTypes()); + } + else if (const auto * from_type = checkAndGetDataType(from_type_untyped.get())) + { + return createMapToMapWrapper(from_type->getKeyValueTypes(), to_type->getKeyValueTypes()); + } + else + { + throw Exception(ErrorCodes::TYPE_MISMATCH, "Unsupported types to CAST AS Map. " + "Left type: {}, right type: {}", from_type_untyped->getName(), to_type->getName()); + } + } + + WrapperType createTupleToObjectWrapper(const DataTypeTuple & from_tuple, bool has_nullable_subcolumns) const + { + if (!from_tuple.haveExplicitNames()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten Named Tuple. Got: {}", from_tuple.getName()); + + PathsInData paths; + DataTypes from_types; + + std::tie(paths, from_types) = flattenTuple(from_tuple.getPtr()); + auto to_types = from_types; + + for (auto & type : to_types) + { + if (isTuple(type) || isNested(type)) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten Named Tuple. Got: {}", + from_tuple.getName()); + + type = recursiveRemoveLowCardinality(type); + } + + return [element_wrappers = getElementWrappers(from_types, to_types), + has_nullable_subcolumns, from_types, to_types, paths] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) + { + size_t tuple_size = to_types.size(); + auto flattened_column = flattenTuple(arguments.front().column); + const auto & column_tuple = assert_cast(*flattened_column); + + if (tuple_size != column_tuple.getColumns().size()) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Expected tuple with {} subcolumn, but got {} subcolumns", + tuple_size, column_tuple.getColumns().size()); + + auto res = ColumnObject::create(has_nullable_subcolumns); + for (size_t i = 0; i < tuple_size; ++i) + { + ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; + auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); + res->addSubcolumn(paths[i], converted_column->assumeMutable()); + } + + return res; + }; + } + + WrapperType createMapToObjectWrapper(const DataTypeMap & from_map, bool has_nullable_subcolumns) const + { + auto key_value_types = from_map.getKeyValueTypes(); + + if (!isStringOrFixedString(key_value_types[0])) + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object from Map can be performed only from Map " + "with String or FixedString key. 
Got: {}", from_map.getName()); + + const auto & value_type = key_value_types[1]; + auto to_value_type = value_type; + + if (!has_nullable_subcolumns && value_type->isNullable()) + to_value_type = removeNullable(value_type); + + if (has_nullable_subcolumns && !value_type->isNullable()) + to_value_type = makeNullable(value_type); + + DataTypes to_key_value_types{std::make_shared(), std::move(to_value_type)}; + auto element_wrappers = getElementWrappers(key_value_types, to_key_value_types); + + return [has_nullable_subcolumns, element_wrappers, key_value_types, to_key_value_types] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t) -> ColumnPtr + { + const auto & column_map = assert_cast(*arguments.front().column); + const auto & offsets = column_map.getNestedColumn().getOffsets(); + auto key_value_columns = column_map.getNestedData().getColumnsCopy(); + + for (size_t i = 0; i < 2; ++i) + { + ColumnsWithTypeAndName element{{key_value_columns[i], key_value_types[i], ""}}; + key_value_columns[i] = element_wrappers[i](element, to_key_value_types[i], nullable_source, key_value_columns[i]->size()); + } + + const auto & key_column_str = assert_cast(*key_value_columns[0]); + const auto & value_column = *key_value_columns[1]; + + using SubcolumnsMap = HashMap; + SubcolumnsMap subcolumns; + + for (size_t row = 0; row < offsets.size(); ++row) + { + for (size_t i = offsets[static_cast(row) - 1]; i < offsets[row]; ++i) + { + auto ref = key_column_str.getDataAt(i); + + bool inserted; + SubcolumnsMap::LookupResult it; + subcolumns.emplace(ref, it, inserted); + auto & subcolumn = it->getMapped(); + + if (inserted) + subcolumn = value_column.cloneEmpty()->cloneResized(row); + + /// Map can have duplicated keys. We insert only first one. + if (subcolumn->size() == row) + subcolumn->insertFrom(value_column, i); + } + + /// Insert default values for keys missed in current row. 
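The tuple and map cast paths above are easiest to see from the user side. A few test-style examples of the behavior they implement (my sketch, not part of the patch; results assume default settings):

SELECT CAST(CAST((1, 'x'), 'Tuple(a UInt32, s String)'), 'Tuple(s String, b UInt32)'); -- ('x', 0): 's' is matched by name, missing 'b' is filled with defaults
SELECT CAST((['k1', 'k2'], [1, 2]), 'Map(String, UInt64)');  -- {'k1': 1, 'k2': 2}
SELECT CAST([('k1', 1), ('k2', 2)], 'Map(String, UInt64)');  -- {'k1': 1, 'k2': 2}
SELECT CAST((['k1'], [1, 2]), 'Map(String, UInt64)'); -- { serverError TYPE_MISMATCH } key and value arrays must have equal sizes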
+ for (const auto & [_, subcolumn] : subcolumns) + if (subcolumn->size() == row) + subcolumn->insertDefault(); + } + + auto column_object = ColumnObject::create(has_nullable_subcolumns); + for (auto && [key, subcolumn] : subcolumns) + { + PathInData path(key.toView()); + column_object->addSubcolumn(path, std::move(subcolumn)); + } + + return column_object; + }; + } + + WrapperType createObjectWrapper(const DataTypePtr & from_type, const DataTypeObject * to_type) const + { + if (const auto * from_tuple = checkAndGetDataType(from_type.get())) + { + return createTupleToObjectWrapper(*from_tuple, to_type->hasNullableSubcolumns()); + } + else if (const auto * from_map = checkAndGetDataType(from_type.get())) + { + return createMapToObjectWrapper(*from_map, to_type->hasNullableSubcolumns()); + } + else if (checkAndGetDataType(from_type.get())) + { + return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) + { + auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count)->assumeMutable(); + res->finalize(); + return res; + }; + } + else if (checkAndGetDataType(from_type.get())) + { + return [is_nullable = to_type->hasNullableSubcolumns()] (ColumnsWithTypeAndName & arguments, const DataTypePtr & , const ColumnNullable * , size_t) -> ColumnPtr + { + auto & column_object = assert_cast(*arguments.front().column); + auto res = ColumnObject::create(is_nullable); + for (size_t i = 0; i < column_object.size(); i++) + res->insert(column_object[i]); + + res->finalize(); + return res; + }; + } + + throw Exception(ErrorCodes::TYPE_MISMATCH, + "Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName()); + } + + WrapperType createVariantToVariantWrapper(const DataTypeVariant & from_variant, const DataTypeVariant & to_variant) const + { + /// We support only extension of variant type, so, only new types can be added. + /// For example: Variant(T1, T2) -> Variant(T1, T2, T3) is supported, but Variant(T1, T2) -> Variant(T1, T3) is not supported. + /// We want to extend Variant type for free without rewriting the data, but we sort data types inside Variant during type creation + /// (we do it because we want Variant(T1, T2) to be the same as Variant(T2, T1)), but after extension the order of variant types + /// (and so their discriminators) can be different. For example: Variant(T1, T3) -> Variant(T1, T2, T3). + /// To avoid full rewrite of discriminators column, ColumnVariant supports it's local order of variant columns (and so local + /// discriminators) and stores mapping global order -> local order. + /// So, to extend Variant with new types for free, we should keep old local order for old variants, append new variants and change + /// mapping global order -> local order according to the new global order. + + /// Create map (new variant type) -> (it's global discriminator in new order). + const auto & new_variants = to_variant.getVariants(); + std::unordered_map new_variant_types_to_new_global_discriminator; + new_variant_types_to_new_global_discriminator.reserve(new_variants.size()); + for (size_t i = 0; i != new_variants.size(); ++i) + new_variant_types_to_new_global_discriminator[new_variants[i]->getName()] = i; + + /// Create set of old variant types. 
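The restriction explained above (a Variant can only be extended, because variants stay sorted and only the global-to-local discriminator mapping is rewritten) looks like this in test-style SQL; a sketch, assuming the experimental Variant type is enabled:

SET allow_experimental_variant_type = 1;
SELECT CAST(CAST('hello', 'Variant(String, UInt64)'), 'Variant(String, UInt64, Array(UInt64))'); -- OK: pure extension, existing data is not rewritten
SELECT CAST(CAST('hello', 'Variant(String, UInt64)'), 'Variant(String, Array(UInt64))'); -- { serverError CANNOT_CONVERT_TYPE }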
+ const auto & old_variants = from_variant.getVariants(); + std::unordered_map old_variant_types_to_old_global_discriminator; + old_variant_types_to_old_global_discriminator.reserve(old_variants.size()); + for (size_t i = 0; i != old_variants.size(); ++i) + old_variant_types_to_old_global_discriminator[old_variants[i]->getName()] = i; + + /// Check that the set of old variants types is a subset of new variant types and collect new global discriminator for each old global discriminator. + std::unordered_map old_global_discriminator_to_new; + old_global_discriminator_to_new.reserve(old_variants.size()); + for (const auto & [old_variant_type, old_discriminator] : old_variant_types_to_old_global_discriminator) + { + auto it = new_variant_types_to_new_global_discriminator.find(old_variant_type); + if (it == new_variant_types_to_new_global_discriminator.end()) + throw Exception( + ErrorCodes::CANNOT_CONVERT_TYPE, + "Cannot convert type {} to {}. Conversion between Variant types is allowed only when new Variant type is an extension " + "of an initial one", from_variant.getName(), to_variant.getName()); + old_global_discriminator_to_new[old_discriminator] = it->second; + } + + /// Collect variant types and their global discriminators that should be added to the old Variant to get the new Variant. + std::vector> variant_types_and_discriminators_to_add; + variant_types_and_discriminators_to_add.reserve(new_variants.size() - old_variants.size()); + for (size_t i = 0; i != new_variants.size(); ++i) + { + if (!old_variant_types_to_old_global_discriminator.contains(new_variants[i]->getName())) + variant_types_and_discriminators_to_add.emplace_back(new_variants[i], i); + } + + return [old_global_discriminator_to_new, variant_types_and_discriminators_to_add] + (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & column_variant = assert_cast(*arguments.front().column.get()); + size_t num_old_variants = column_variant.getNumVariants(); + Columns new_variant_columns; + new_variant_columns.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); + std::vector new_local_to_global_discriminators; + new_local_to_global_discriminators.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); + for (size_t i = 0; i != num_old_variants; ++i) + { + new_variant_columns.push_back(column_variant.getVariantPtrByLocalDiscriminator(i)); + new_local_to_global_discriminators.push_back(old_global_discriminator_to_new.at(column_variant.globalDiscriminatorByLocal(i))); + } + + for (const auto & [new_variant_type, new_global_discriminator] : variant_types_and_discriminators_to_add) + { + new_variant_columns.push_back(new_variant_type->createColumn()); + new_local_to_global_discriminators.push_back(new_global_discriminator); + } + + return ColumnVariant::create(column_variant.getLocalDiscriminatorsPtr(), column_variant.getOffsetsPtr(), new_variant_columns, new_local_to_global_discriminators); + }; + } + + WrapperType createVariantToColumnWrapper(const DataTypeVariant & from_variant, const DataTypePtr & to_type) const + { + const auto & variant_types = from_variant.getVariants(); + std::vector variant_wrappers; + variant_wrappers.reserve(variant_types.size()); + + /// Create conversion wrapper for each variant. 
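In user-visible terms, the Variant-to-column wrapper built below casts every variant to the target type, and rows holding the NULL discriminator receive the target type's default value (see insertDefault() further down). A sketch under the same experimental setting:

SET allow_experimental_variant_type = 1;
SELECT CAST(CAST(toUInt64(42), 'Variant(UInt64, String)'), 'String'); -- '42'
SELECT CAST(CAST(NULL, 'Variant(UInt64, String)'), 'String');         -- '': default value for rows with the NULL discriminator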
+ for (const auto & variant_type : variant_types) + variant_wrappers.push_back(prepareUnpackDictionaries(variant_type, to_type)); + + return [variant_wrappers, variant_types, to_type] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + const auto & column_variant = assert_cast(*arguments.front().column.get()); + + /// First, cast each variant to the result type. + std::vector casted_variant_columns; + casted_variant_columns.reserve(variant_types.size()); + for (size_t i = 0; i != variant_types.size(); ++i) + { + auto variant_col = column_variant.getVariantPtrByLocalDiscriminator(i); + ColumnsWithTypeAndName variant = {{variant_col, variant_types[i], "" }}; + const auto & variant_wrapper = variant_wrappers[column_variant.globalDiscriminatorByLocal(i)]; + casted_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_col->size())); + } + + /// Second, construct resulting column from casted variant columns according to discriminators. + const auto & local_discriminators = column_variant.getLocalDiscriminators(); + auto res = result_type->createColumn(); + res->reserve(input_rows_count); + for (size_t i = 0; i != input_rows_count; ++i) + { + auto local_discr = local_discriminators[i]; + if (local_discr == ColumnVariant::NULL_DISCRIMINATOR) + res->insertDefault(); + else + res->insertFrom(*casted_variant_columns[local_discr], column_variant.offsetAt(i)); + } + + return res; + }; + } + + static ColumnPtr createVariantFromDescriptorsAndOneNonEmptyVariant(const DataTypes & variant_types, const ColumnPtr & discriminators, const ColumnPtr & variant, ColumnVariant::Discriminator variant_discr) + { + Columns variants; + variants.reserve(variant_types.size()); + for (size_t i = 0; i != variant_types.size(); ++i) + { + if (i == variant_discr) + variants.emplace_back(variant); + else + variants.push_back(variant_types[i]->createColumn()); + } + + return ColumnVariant::create(discriminators, variants); + } + + WrapperType createColumnToVariantWrapper(const DataTypePtr & from_type, const DataTypeVariant & to_variant) const + { + /// We allow converting NULL to Variant(...) as Variant can store NULLs. + if (from_type->onlyNull()) + { + return [](ColumnsWithTypeAndName &, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + auto result_column = result_type->createColumn(); + result_column->insertManyDefaults(input_rows_count); + return result_column; + }; + } + + auto variant_discr_opt = to_variant.tryGetVariantDiscriminator(*removeNullableOrLowCardinalityNullable(from_type)); + if (!variant_discr_opt) + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert type {} to {}. 
Conversion to Variant allowed only for types from this Variant", from_type->getName(), to_variant.getName()); + + return [variant_discr = *variant_discr_opt] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t) -> ColumnPtr + { + const auto & result_variant_type = assert_cast(*result_type); + const auto & variant_types = result_variant_type.getVariants(); + if (const ColumnNullable * col_nullable = typeid_cast(arguments.front().column.get())) + { + const auto & column = col_nullable->getNestedColumnPtr(); + const auto & null_map = col_nullable->getNullMapData(); + IColumn::Filter filter; + filter.reserve(column->size()); + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + auto & discriminators_data = discriminators->getData(); + discriminators_data.reserve(column->size()); + size_t variant_size_hint = 0; + for (size_t i = 0; i != column->size(); ++i) + { + if (null_map[i]) + { + discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + filter.push_back(0); + } + else + { + discriminators_data.push_back(variant_discr); + filter.push_back(1); + ++variant_size_hint; + } + } + + ColumnPtr variant_column; + /// If there were no NULLs, just use the column. + if (variant_size_hint == column->size()) + variant_column = column; + /// Otherwise we should use filtered column. + else + variant_column = column->filter(filter, variant_size_hint); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), variant_column, variant_discr); + } + else if (isColumnLowCardinalityNullable(*arguments.front().column)) + { + const auto & column = arguments.front().column; + + /// Variant column cannot have LowCardinality(Nullable(...)) variant, as Variant column stores NULLs itself. + /// We should create a null-map, insert NULL_DISCRIMINATOR on NULL values and filter initial column. + const auto & col_lc = assert_cast(*column); + const auto & indexes = col_lc.getIndexes(); + auto null_index = col_lc.getDictionary().getNullValueIndex(); + IColumn::Filter filter; + filter.reserve(col_lc.size()); + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + auto & discriminators_data = discriminators->getData(); + discriminators_data.reserve(col_lc.size()); + size_t variant_size_hint = 0; + for (size_t i = 0; i != col_lc.size(); ++i) + { + if (indexes.getUInt(i) == null_index) + { + discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); + filter.push_back(0); + } + else + { + discriminators_data.push_back(variant_discr); + filter.push_back(1); + ++variant_size_hint; + } + } + + MutableColumnPtr variant_column; + /// If there were no NULLs, we can just clone the column. + if (variant_size_hint == col_lc.size()) + variant_column = IColumn::mutate(column); + /// Otherwise we should filter column. 
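The NULL handling implemented above means NULLs become the NULL discriminator, while a source type that is not one of the variants is rejected; a test-style sketch:

SET allow_experimental_variant_type = 1;
SELECT CAST(if(number % 2 = 0, NULL, number), 'Variant(UInt64, String)') FROM numbers(3); -- NULL, 1, NULL
SELECT CAST(3.14, 'Variant(UInt64, String)'); -- { serverError CANNOT_CONVERT_TYPE } Float64 is not among the variants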
+ else + variant_column = column->filter(filter, variant_size_hint)->assumeMutable(); + + assert_cast(*variant_column).nestedRemoveNullable(); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), std::move(variant_column), variant_discr); + } + else + { + const auto & column = arguments.front().column; + auto discriminators = ColumnVariant::ColumnDiscriminators::create(); + discriminators->getData().resize_fill(column->size(), variant_discr); + return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), column, variant_discr); + } + }; + } + + /// Wrapper for conversion to/from Variant type + WrapperType createVariantWrapper(const DataTypePtr & from_type, const DataTypePtr & to_type) const + { + if (const auto * from_variant = checkAndGetDataType(from_type.get())) + { + if (const auto * to_variant = checkAndGetDataType(to_type.get())) + return createVariantToVariantWrapper(*from_variant, *to_variant); + + return createVariantToColumnWrapper(*from_variant, to_type); + } + + return createColumnToVariantWrapper(from_type, assert_cast(*to_type)); + } + + template + WrapperType createEnumWrapper(const DataTypePtr & from_type, const DataTypeEnum * to_type) const + { + using EnumType = DataTypeEnum; + using Function = typename FunctionTo::Type; + + if (const auto * from_enum8 = checkAndGetDataType(from_type.get())) + checkEnumToEnumConversion(from_enum8, to_type); + else if (const auto * from_enum16 = checkAndGetDataType(from_type.get())) + checkEnumToEnumConversion(from_enum16, to_type); + + if (checkAndGetDataType(from_type.get())) + return createStringToEnumWrapper(); + else if (checkAndGetDataType(from_type.get())) + return createStringToEnumWrapper(); + else if (isNativeNumber(from_type) || isEnum(from_type)) + { + auto function = Function::create(); + return createFunctionAdaptor(function, from_type); + } + else + { + if (cast_type == CastType::accurateOrNull) + return createToNullableColumnWrapper(); + else + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion from {} to {} is not supported", + from_type->getName(), to_type->getName()); + } + } + + template + void checkEnumToEnumConversion(const EnumTypeFrom * from_type, const EnumTypeTo * to_type) const + { + const auto & from_values = from_type->getValues(); + const auto & to_values = to_type->getValues(); + + using ValueType = std::common_type_t; + using NameValuePair = std::pair; + using EnumValues = std::vector; + + EnumValues name_intersection; + std::set_intersection(std::begin(from_values), std::end(from_values), + std::begin(to_values), std::end(to_values), std::back_inserter(name_intersection), + [] (auto && from, auto && to) { return from.first < to.first; }); + + for (const auto & name_value : name_intersection) + { + const auto & old_value = name_value.second; + const auto & new_value = to_type->getValue(name_value.first); + if (old_value != new_value) + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Enum conversion changes value for element '{}' from {} to {}", + name_value.first, toString(old_value), toString(new_value)); + } + } + + template + WrapperType createStringToEnumWrapper() const + { + const char * function_name = cast_name; + return [function_name] ( + ColumnsWithTypeAndName & arguments, const DataTypePtr & res_type, const ColumnNullable * nullable_col, size_t /*input_rows_count*/) + { + const auto & first_col = arguments.front().column.get(); + const auto & result_type = typeid_cast(*res_type); + + const 
ColumnStringType * col = typeid_cast(first_col); + + if (col && nullable_col && nullable_col->size() != col->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ColumnNullable is not compatible with original"); + + if (col) + { + const auto size = col->size(); + + auto res = result_type.createColumn(); + auto & out_data = static_cast(*res).getData(); + out_data.resize(size); + + auto default_enum_value = result_type.getValues().front().second; + + if (nullable_col) + { + for (size_t i = 0; i < size; ++i) + { + if (!nullable_col->isNullAt(i)) + out_data[i] = result_type.getValue(col->getDataAt(i)); + else + out_data[i] = default_enum_value; + } + } + else + { + for (size_t i = 0; i < size; ++i) + out_data[i] = result_type.getValue(col->getDataAt(i)); + } + + return res; + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected column {} as first argument of function {}", + first_col->getName(), function_name); + }; + } + + template + WrapperType createEnumToStringWrapper() const + { + const char * function_name = cast_name; + return [function_name] ( + ColumnsWithTypeAndName & arguments, const DataTypePtr & res_type, const ColumnNullable * nullable_col, size_t /*input_rows_count*/) + { + using ColumnEnumType = EnumType::ColumnType; + + const auto & first_col = arguments.front().column.get(); + const auto & first_type = arguments.front().type.get(); + + const ColumnEnumType * enum_col = typeid_cast(first_col); + const EnumType * enum_type = typeid_cast(first_type); + + if (enum_col && nullable_col && nullable_col->size() != enum_col->size()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "ColumnNullable is not compatible with original"); + + if (enum_col && enum_type) + { + const auto size = enum_col->size(); + const auto & enum_data = enum_col->getData(); + + auto res = res_type->createColumn(); + + if (nullable_col) + { + for (size_t i = 0; i < size; ++i) + { + if (!nullable_col->isNullAt(i)) + { + const auto & value = enum_type->getNameForValue(enum_data[i]); + res->insertData(value.data, value.size); + } + else + res->insertDefault(); + } + } + else + { + for (size_t i = 0; i < size; ++i) + { + const auto & value = enum_type->getNameForValue(enum_data[i]); + res->insertData(value.data, value.size); + } + } + + return res; + } + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected column {} as first argument of function {}", + first_col->getName(), function_name); + }; + } + + static WrapperType createIdentityWrapper(const DataTypePtr &) + { + return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t /*input_rows_count*/) + { + return arguments.front().column; + }; + } + + static WrapperType createNothingWrapper(const IDataType * to_type) + { + ColumnPtr res = to_type->createColumnConstWithDefaultValue(1); + return [res] (ColumnsWithTypeAndName &, const DataTypePtr &, const ColumnNullable *, size_t input_rows_count) + { + /// Column of Nothing type is trivially convertible to any other column + return res->cloneResized(input_rows_count)->convertToFullColumnIfConst(); + }; + } + + WrapperType prepareUnpackDictionaries(const DataTypePtr & from_type, const DataTypePtr & to_type) const + { + /// Conversion from/to Variant data type is processed in a special way. + /// We don't need to remove LowCardinality/Nullable. 
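The enum wrappers above map strings to enum values by name and validate enum-to-enum conversions: an element name shared by both enums must keep its numeric value. For example (my sketch):

SELECT CAST('b', 'Enum8(''a'' = 1, ''b'' = 2)');                 -- 'b', stored as 2
SELECT CAST(CAST('b', 'Enum8(''a'' = 1, ''b'' = 2)'), 'String'); -- 'b'
SELECT CAST(CAST('a', 'Enum8(''a'' = 1)'), 'Enum8(''a'' = 2)'); -- { serverError CANNOT_CONVERT_TYPE } value of 'a' would change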
+ if (isVariant(to_type) || isVariant(from_type)) + return createVariantWrapper(from_type, to_type); + + const auto * from_low_cardinality = typeid_cast(from_type.get()); + const auto * to_low_cardinality = typeid_cast(to_type.get()); + const auto & from_nested = from_low_cardinality ? from_low_cardinality->getDictionaryType() : from_type; + const auto & to_nested = to_low_cardinality ? to_low_cardinality->getDictionaryType() : to_type; + + if (from_type->onlyNull()) + { + if (!to_nested->isNullable() && !isVariant(to_type)) + { + if (cast_type == CastType::accurateOrNull) + { + return createToNullableColumnWrapper(); + } + else + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert NULL to a non-nullable type"); + } + } + + return [](ColumnsWithTypeAndName &, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) + { + return result_type->createColumnConstWithDefaultValue(input_rows_count)->convertToFullColumnIfConst(); + }; + } + + bool skip_not_null_check = false; + + if (from_low_cardinality && from_nested->isNullable() && !to_nested->isNullable()) + /// Disable check for dictionary. Will check that column doesn't contain NULL in wrapper below. + skip_not_null_check = true; + + auto wrapper = prepareRemoveNullable(from_nested, to_nested, skip_not_null_check); + if (!from_low_cardinality && !to_low_cardinality) + return wrapper; + + return [wrapper, from_low_cardinality, to_low_cardinality, skip_not_null_check] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) -> ColumnPtr + { + ColumnsWithTypeAndName args = {arguments[0]}; + auto & arg = args.front(); + auto res_type = result_type; + + ColumnPtr converted_column; + + ColumnPtr res_indexes; + /// For some types default can't be casted (for example, String to Int). In that case convert column to full. + bool src_converted_to_full_column = false; + + { + auto tmp_rows_count = input_rows_count; + + if (to_low_cardinality) + res_type = to_low_cardinality->getDictionaryType(); + + if (from_low_cardinality) + { + const auto * col_low_cardinality = typeid_cast(arguments[0].column.get()); + + if (skip_not_null_check && col_low_cardinality->containsNull()) + throw Exception(ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN, "Cannot convert NULL value to non-Nullable type"); + + arg.column = col_low_cardinality->getDictionary().getNestedColumn(); + arg.type = from_low_cardinality->getDictionaryType(); + + /// TODO: Make map with defaults conversion. + src_converted_to_full_column = !removeNullable(arg.type)->equals(*removeNullable(res_type)); + if (src_converted_to_full_column) + arg.column = arg.column->index(col_low_cardinality->getIndexes(), 0); + else + res_indexes = col_low_cardinality->getIndexesPtr(); + + tmp_rows_count = arg.column->size(); + } + + /// Perform the requested conversion. 
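For LowCardinality the wrapper above converts only the dictionary and reuses the index column when possible; NULLs in the dictionary are rejected when the target is not Nullable. A sketch of the observable behavior:

SELECT CAST(CAST('42', 'LowCardinality(String)'), 'LowCardinality(UInt64)'); -- 42, converted via the dictionary, indexes reused
SELECT CAST(CAST(NULL, 'LowCardinality(Nullable(String))'), 'UInt64'); -- { serverError CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN }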
+ converted_column = wrapper(args, res_type, nullable_source, tmp_rows_count); + } + + if (to_low_cardinality) + { + auto res_column = to_low_cardinality->createColumn(); + auto * col_low_cardinality = typeid_cast(res_column.get()); + + if (from_low_cardinality && !src_converted_to_full_column) + { + col_low_cardinality->insertRangeFromDictionaryEncodedColumn(*converted_column, *res_indexes); + } + else + col_low_cardinality->insertRangeFromFullColumn(*converted_column, 0, converted_column->size()); + + return res_column; + } + else if (!src_converted_to_full_column) + return converted_column->index(*res_indexes, 0); + else + return converted_column; + }; + } + + WrapperType prepareRemoveNullable(const DataTypePtr & from_type, const DataTypePtr & to_type, bool skip_not_null_check) const + { + /// Determine whether pre-processing and/or post-processing must take place during conversion. + + bool source_is_nullable = from_type->isNullable(); + bool result_is_nullable = to_type->isNullable(); + + auto wrapper = prepareImpl(removeNullable(from_type), removeNullable(to_type), result_is_nullable); + + if (result_is_nullable) + { + return [wrapper, source_is_nullable] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + /// Create a temporary columns on which to perform the operation. + const auto & nullable_type = static_cast(*result_type); + const auto & nested_type = nullable_type.getNestedType(); + + ColumnsWithTypeAndName tmp_args; + if (source_is_nullable) + tmp_args = createBlockWithNestedColumns(arguments); + else + tmp_args = arguments; + + const ColumnNullable * nullable_source = nullptr; + + /// Add original ColumnNullable for createStringToEnumWrapper() + if (source_is_nullable) + { + if (arguments.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid number of arguments"); + nullable_source = typeid_cast(arguments.front().column.get()); + } + + /// Perform the requested conversion. + auto tmp_res = wrapper(tmp_args, nested_type, nullable_source, input_rows_count); + + /// May happen in fuzzy tests. For debug purpose. + if (!tmp_res) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Couldn't convert {} to {} in prepareRemoveNullable wrapper.", + arguments[0].type->getName(), nested_type->getName()); + + return wrapInNullable(tmp_res, arguments, nested_type, input_rows_count); + }; + } + else if (source_is_nullable) + { + /// Conversion from Nullable to non-Nullable. + + return [wrapper, skip_not_null_check] + (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + auto tmp_args = createBlockWithNestedColumns(arguments); + auto nested_type = removeNullable(result_type); + + /// Check that all values are not-NULL. + /// Check can be skipped in case if LowCardinality dictionary is transformed. + /// In that case, correctness will be checked beforehand. 
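The Nullable pre- and post-processing above yields the familiar CAST rules: a Nullable column converts to a non-Nullable type only if it contains no NULLs, while the accurateCastOrNull flavor returns NULL instead of throwing. For example (my sketch):

SELECT CAST(toNullable(1), 'UInt8');                  -- 1: the null map is all zeros, the check passes
SELECT CAST(CAST(NULL, 'Nullable(String)'), 'UInt8'); -- { serverError CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN }
SELECT accurateCastOrNull('foo', 'UInt8');            -- NULL instead of an exception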
+ if (!skip_not_null_check) + { + const auto & col = arguments[0].column; + const auto & nullable_col = assert_cast(*col); + const auto & null_map = nullable_col.getNullMapData(); + + if (!memoryIsZero(null_map.data(), 0, null_map.size())) + throw Exception(ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN, "Cannot convert NULL value to non-Nullable type"); + } + const ColumnNullable * nullable_source = typeid_cast(arguments.front().column.get()); + return wrapper(tmp_args, nested_type, nullable_source, input_rows_count); + }; + } + else + return wrapper; + } + + /// 'from_type' and 'to_type' are nested types in case of Nullable. + /// 'requested_result_is_nullable' is true if CAST to Nullable type is requested. + WrapperType prepareImpl(const DataTypePtr & from_type, const DataTypePtr & to_type, bool requested_result_is_nullable) const + { + if (isUInt8(from_type) && isBool(to_type)) + return createUInt8ToBoolWrapper(from_type, to_type); + + /// We can cast IPv6 into IPv6, IPv4 into IPv4, but we should not allow to cast FixedString(16) into IPv6 as part of identity cast + bool safe_convert_custom_types = true; + + if (const auto * to_type_custom_name = to_type->getCustomName()) + safe_convert_custom_types = from_type->getCustomName() && from_type->getCustomName()->getName() == to_type_custom_name->getName(); + else if (const auto * from_type_custom_name = from_type->getCustomName()) + safe_convert_custom_types = to_type->getCustomName() && from_type_custom_name->getName() == to_type->getCustomName()->getName(); + + if (from_type->equals(*to_type) && safe_convert_custom_types) + { + /// We can only use identity conversion for DataTypeAggregateFunction when they are strictly equivalent. + if (typeid_cast(from_type.get())) + { + if (DataTypeAggregateFunction::strictEquals(from_type, to_type)) + return createIdentityWrapper(from_type); + } + else + return createIdentityWrapper(from_type); + } + else if (WhichDataType(from_type).isNothing()) + return createNothingWrapper(to_type.get()); + + WrapperType ret; + + auto make_default_wrapper = [&](const auto & types) -> bool + { + using Types = std::decay_t; + using ToDataType = typename Types::LeftType; + + if constexpr ( + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v || + std::is_same_v) + { + ret = createWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); + return true; + } + if constexpr (std::is_same_v) + { + if (isBool(to_type)) + ret = createBoolWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); + else + ret = createWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); + return true; + } + if constexpr ( + std::is_same_v || + std::is_same_v) + { + ret = createEnumWrapper(from_type, checkAndGetDataType(to_type.get())); + return true; + } + if constexpr ( + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v> || + std::is_same_v) + { + ret = createDecimalWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); + return true; + } + + return false; + }; + + bool cast_ipv4_ipv6_default_on_conversion_error_value = context && 
context->getSettingsRef().cast_ipv4_ipv6_default_on_conversion_error; + bool input_format_ipv4_default_on_conversion_error_value = context && context->getSettingsRef().input_format_ipv4_default_on_conversion_error; + bool input_format_ipv6_default_on_conversion_error_value = context && context->getSettingsRef().input_format_ipv6_default_on_conversion_error; + + auto make_custom_serialization_wrapper = [&, cast_ipv4_ipv6_default_on_conversion_error_value, input_format_ipv4_default_on_conversion_error_value, input_format_ipv6_default_on_conversion_error_value](const auto & types) -> bool + { + using Types = std::decay_t; + using ToDataType = typename Types::RightType; + using FromDataType = typename Types::LeftType; + + if constexpr (WhichDataType(FromDataType::type_id).isStringOrFixedString()) + { + if constexpr (std::is_same_v) + { + ret = [cast_ipv4_ipv6_default_on_conversion_error_value, + input_format_ipv4_default_on_conversion_error_value, + requested_result_is_nullable]( + ColumnsWithTypeAndName & arguments, + const DataTypePtr & result_type, + const ColumnNullable * column_nullable, + size_t) -> ColumnPtr + { + if (!WhichDataType(result_type).isIPv4()) + throw Exception(ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected IPv4", result_type->getName()); + + const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr; + if (requested_result_is_nullable) + return convertToIPv4(arguments[0].column, null_map); + else if (cast_ipv4_ipv6_default_on_conversion_error_value || input_format_ipv4_default_on_conversion_error_value) + return convertToIPv4(arguments[0].column, null_map); + else + return convertToIPv4(arguments[0].column, null_map); + }; + + return true; + } + + if constexpr (std::is_same_v) + { + ret = [cast_ipv4_ipv6_default_on_conversion_error_value, + input_format_ipv6_default_on_conversion_error_value, + requested_result_is_nullable]( + ColumnsWithTypeAndName & arguments, + const DataTypePtr & result_type, + const ColumnNullable * column_nullable, + size_t) -> ColumnPtr + { + if (!WhichDataType(result_type).isIPv6()) + throw Exception( + ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected IPv6", result_type->getName()); + + const auto * null_map = column_nullable ? 
&column_nullable->getNullMapData() : nullptr; + if (requested_result_is_nullable) + return convertToIPv6(arguments[0].column, null_map); + else if (cast_ipv4_ipv6_default_on_conversion_error_value || input_format_ipv6_default_on_conversion_error_value) + return convertToIPv6(arguments[0].column, null_map); + else + return convertToIPv6(arguments[0].column, null_map); + }; + + return true; + } + + if (to_type->getCustomSerialization() && to_type->getCustomName()) + { + ret = [requested_result_is_nullable]( + ColumnsWithTypeAndName & arguments, + const DataTypePtr & result_type, + const ColumnNullable * column_nullable, + size_t input_rows_count) -> ColumnPtr + { + auto wrapped_result_type = result_type; + if (requested_result_is_nullable) + wrapped_result_type = makeNullable(result_type); + return ConvertImplGenericFromString::execute( + arguments, wrapped_result_type, column_nullable, input_rows_count); + }; + return true; + } + } + else if constexpr (WhichDataType(FromDataType::type_id).isIPv6() && WhichDataType(ToDataType::type_id).isIPv4()) + { + ret = [cast_ipv4_ipv6_default_on_conversion_error_value, requested_result_is_nullable]( + ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t) + -> ColumnPtr + { + if (!WhichDataType(result_type).isIPv4()) + throw Exception( + ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected IPv4", result_type->getName()); + + const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr; + if (requested_result_is_nullable) + return convertIPv6ToIPv4(arguments[0].column, null_map); + else if (cast_ipv4_ipv6_default_on_conversion_error_value) + return convertIPv6ToIPv4(arguments[0].column, null_map); + else + return convertIPv6ToIPv4(arguments[0].column, null_map); + }; + + return true; + } + + if constexpr (WhichDataType(ToDataType::type_id).isStringOrFixedString()) + { + if constexpr (WhichDataType(FromDataType::type_id).isEnum()) + { + ret = createEnumToStringWrapper(); + return true; + } + else if (from_type->getCustomSerialization()) + { + ret = [](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr + { + return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); + }; + return true; + } + } + + return false; + }; + + if (callOnTwoTypeIndexes(from_type->getTypeId(), to_type->getTypeId(), make_custom_serialization_wrapper)) + return ret; + + if (callOnIndexAndDataType(to_type->getTypeId(), make_default_wrapper)) + return ret; + + switch (to_type->getTypeId()) + { + case TypeIndex::String: + return createStringWrapper(from_type); + case TypeIndex::FixedString: + return createFixedStringWrapper(from_type, checkAndGetDataType(to_type.get())->getN()); + case TypeIndex::Array: + return createArrayWrapper(from_type, static_cast(*to_type)); + case TypeIndex::Tuple: + return createTupleWrapper(from_type, checkAndGetDataType(to_type.get())); + case TypeIndex::Map: + return createMapWrapper(from_type, checkAndGetDataType(to_type.get())); + case TypeIndex::Object: + return createObjectWrapper(from_type, checkAndGetDataType(to_type.get())); + case TypeIndex::AggregateFunction: + return createAggregateFunctionWrapper(from_type, checkAndGetDataType(to_type.get())); + case TypeIndex::Interval: + return createIntervalWrapper(from_type, checkAndGetDataType(to_type.get())->getKind()); + default: + break; + } + + if (cast_type == CastType::accurateOrNull) + return 
createToNullableColumnWrapper(); + else + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion from {} to {} is not supported", + from_type->getName(), to_type->getName()); + } +}; + +class MonotonicityHelper +{ +public: + using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; + + template + static auto monotonicityForType(const DataType * const) + { + return FunctionTo::Type::Monotonic::get; + } + + static MonotonicityForRange getMonotonicityInformation(const DataTypePtr & from_type, const IDataType * to_type) + { + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (isEnum(from_type)) + { + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + if (const auto * type = checkAndGetDataType(to_type)) + return monotonicityForType(type); + } + /// other types like Null, FixedString, Array and Tuple have no monotonicity defined + return {}; + } +}; + + FunctionBasePtr createFunctionBaseCast( ContextPtr context , const ColumnsWithTypeAndName & arguments @@ -146,54 +5102,4 @@ REGISTER_FUNCTION(Conversion) factory.registerFunction>(); } - -MonotonicityHelper::MonotonicityForRange MonotonicityHelper::getMonotonicityInformation(const DataTypePtr & from_type, const IDataType * to_type) -{ - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = 
checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (isEnum(from_type)) - { - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - if (const auto * type = checkAndGetDataType(to_type)) - return monotonicityForType(type); - } - /// other types like Null, FixedString, Array and Tuple have no monotonicity defined - return {}; -} - } diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h deleted file mode 100644 index c21e85fb40e..00000000000 --- a/src/Functions/FunctionsConversion.h +++ /dev/null @@ -1,4924 +0,0 @@ -#pragma once - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - - -namespace DB -{ - -namespace ErrorCodes -{ - extern const int ATTEMPT_TO_READ_AFTER_EOF; - extern const int CANNOT_PARSE_NUMBER; - extern const int CANNOT_READ_ARRAY_FROM_TEXT; - extern const int CANNOT_PARSE_INPUT_ASSERTION_FAILED; - extern const int CANNOT_PARSE_QUOTED_STRING; - extern const int CANNOT_PARSE_ESCAPE_SEQUENCE; - extern const int CANNOT_PARSE_DATE; - extern const int CANNOT_PARSE_DATETIME; - extern const int CANNOT_PARSE_TEXT; - extern const int CANNOT_PARSE_UUID; - extern const int CANNOT_PARSE_IPV4; - extern const int CANNOT_PARSE_IPV6; - extern const int TOO_FEW_ARGUMENTS_FOR_FUNCTION; - extern const int LOGICAL_ERROR; - extern const int TYPE_MISMATCH; - extern const int CANNOT_CONVERT_TYPE; - extern const int ILLEGAL_COLUMN; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; - extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int NOT_IMPLEMENTED; - extern const int CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN; - extern const int CANNOT_PARSE_BOOL; - extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE; -} - -/** Type conversion functions. - * toType - conversion in "natural way"; - */ - -UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column); - -/// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type. 
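Per the comment above, toUnixTimestamp parses a string exactly like toDateTime and returns the numeric value; a sketch with the timezone pinned for determinism:

SELECT toDateTime('1970-01-02 00:00:00', 'UTC'), toUnixTimestamp('1970-01-02 00:00:00', 'UTC'); -- 1970-01-02 00:00:00, 86400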
-struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; }; - -struct AccurateConvertStrategyAdditions -{ - UInt32 scale { 0 }; -}; - -struct AccurateOrNullConvertStrategyAdditions -{ - UInt32 scale { 0 }; -}; - - -struct ConvertDefaultBehaviorTag {}; -struct ConvertReturnNullOnErrorTag {}; -struct ConvertReturnZeroOnErrorTag {}; - -/** Conversion of number types to each other, enums to numbers, dates and datetimes to numbers and back: done by straight assignment. - * (Date is represented internally as number of days from some day; DateTime - as unix timestamp) - */ -template -struct ConvertImpl -{ - using FromFieldType = typename FromDataType::FieldType; - using ToFieldType = typename ToDataType::FieldType; - - template - static ColumnPtr NO_SANITIZE_UNDEFINED execute( - const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type [[maybe_unused]], size_t input_rows_count, - Additions additions [[maybe_unused]] = Additions()) - { - const ColumnWithTypeAndName & named_from = arguments[0]; - - using ColVecFrom = typename FromDataType::ColumnType; - using ColVecTo = typename ToDataType::ColumnType; - - if constexpr ((IsDataTypeDecimal || IsDataTypeDecimal) - && !(std::is_same_v || std::is_same_v)) - { - if constexpr (!IsDataTypeDecimalOrNumber || !IsDataTypeDecimalOrNumber) - { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - named_from.column->getName(), Name::name); - } - } - - if (const ColVecFrom * col_from = checkAndGetColumn(named_from.column.get())) - { - typename ColVecTo::MutablePtr col_to = nullptr; - - if constexpr (IsDataTypeDecimal) - { - UInt32 scale; - - if constexpr (std::is_same_v - || std::is_same_v) - { - scale = additions.scale; - } - else - { - scale = additions; - } - - col_to = ColVecTo::create(0, scale); - } - else - col_to = ColVecTo::create(); - - const auto & vec_from = col_from->getData(); - auto & vec_to = col_to->getData(); - vec_to.resize(input_rows_count); - - ColumnUInt8::MutablePtr col_null_map_to; - ColumnUInt8::Container * vec_null_map_to [[maybe_unused]] = nullptr; - if constexpr (std::is_same_v) - { - col_null_map_to = ColumnUInt8::create(input_rows_count, false); - vec_null_map_to = &col_null_map_to->getData(); - } - - bool result_is_bool = isBool(result_type); - for (size_t i = 0; i < input_rows_count; ++i) - { - if constexpr (std::is_same_v) - { - if (result_is_bool) - { - vec_to[i] = vec_from[i] != FromFieldType(0); - continue; - } - } - - if constexpr (std::is_same_v && std::is_same_v) - { - static_assert( - std::is_same_v, - "UInt128 and UUID types must be same"); - - vec_to[i].items[1] = vec_from[i].toUnderType().items[0]; - vec_to[i].items[0] = vec_from[i].toUnderType().items[1]; - - continue; - } - - if constexpr (std::is_same_v && std::is_same_v) - { - static_assert( - std::is_same_v, - "UInt128 and IPv6 types must be same"); - - vec_to[i].items[1] = std::byteswap(vec_from[i].toUnderType().items[0]); - vec_to[i].items[0] = std::byteswap(vec_from[i].toUnderType().items[1]); - - continue; - } - - if constexpr (std::is_same_v != std::is_same_v) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Conversion between numeric types and UUID is not supported. 
" - "Probably the passed UUID is unquoted"); - } - else if constexpr ( - (std::is_same_v != std::is_same_v) - && !(is_any_of || is_any_of) - ) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Conversion from {} to {} is not supported", - TypeName, TypeName); - } - else if constexpr (std::is_same_v != std::is_same_v && !(std::is_same_v || std::is_same_v)) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Conversion between numeric types and IPv6 is not supported. " - "Probably the passed IPv6 is unquoted"); - } - else - { - if constexpr (IsDataTypeDecimal || IsDataTypeDecimal) - { - if constexpr (std::is_same_v) - { - ToFieldType result; - bool convert_result = false; - - if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) - convert_result = tryConvertDecimals(vec_from[i], col_from->getScale(), col_to->getScale(), result); - else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) - convert_result = tryConvertFromDecimal(vec_from[i], col_from->getScale(), result); - else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) - convert_result = tryConvertToDecimal(vec_from[i], col_to->getScale(), result); - - if (convert_result) - vec_to[i] = result; - else - { - vec_to[i] = static_cast(0); - (*vec_null_map_to)[i] = true; - } - } - else - { - if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) - vec_to[i] = convertDecimals(vec_from[i], col_from->getScale(), col_to->getScale()); - else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) - vec_to[i] = convertFromDecimal(vec_from[i], col_from->getScale()); - else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) - vec_to[i] = convertToDecimal(vec_from[i], col_to->getScale()); - else - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Unsupported data type in conversion function"); - } - } - else - { - /// If From Data is Nan or Inf and we convert to integer type, throw exception - if constexpr (std::is_floating_point_v && !std::is_floating_point_v) - { - if (!isFinite(vec_from[i])) - { - if constexpr (std::is_same_v) - { - vec_to[i] = 0; - (*vec_null_map_to)[i] = true; - continue; - } - else - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Unexpected inf or nan to integer conversion"); - } - } - - if constexpr (std::is_same_v - || std::is_same_v) - { - bool convert_result = accurate::convertNumeric(vec_from[i], vec_to[i]); - - if (!convert_result) - { - if (std::is_same_v) - { - vec_to[i] = 0; - (*vec_null_map_to)[i] = true; - } - else - { - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", - named_from.column->getName(), result_type->getName()); - } - } - } - else - { - if constexpr (std::is_same_v && std::is_same_v) - { - const uint8_t ip4_cidr[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; - const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); - if (!matchIPv6Subnet(src, ip4_cidr, 96)) - { - char addr[IPV6_MAX_TEXT_LENGTH + 1] {}; - char * paddr = addr; - formatIPv6(src, paddr); - - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "IPv6 {} in column {} is not in IPv4 mapping block", addr, named_from.column->getName()); - } - - uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); - if constexpr (std::endian::native == std::endian::little) - { - dst[0] = src[15]; - dst[1] = src[14]; - dst[2] = src[13]; - dst[3] = src[12]; - } - else - { - dst[0] = src[12]; - dst[1] = src[13]; - dst[2] = src[14]; - dst[3] = src[15]; - } - } - else if constexpr (std::is_same_v && std::is_same_v) - { - const 
uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); - uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); - std::memset(dst, '\0', IPV6_BINARY_LENGTH); - dst[10] = dst[11] = 0xff; - - if constexpr (std::endian::native == std::endian::little) - { - dst[12] = src[3]; - dst[13] = src[2]; - dst[14] = src[1]; - dst[15] = src[0]; - } - else - { - dst[12] = src[0]; - dst[13] = src[1]; - dst[14] = src[2]; - dst[15] = src[3]; - } - } - else if constexpr (std::is_same_v && std::is_same_v) - vec_to[i] = static_cast(static_cast(vec_from[i])); - else if constexpr (std::is_same_v && (std::is_same_v || std::is_same_v)) - vec_to[i] = static_cast(vec_from[i] * DATE_SECONDS_PER_DAY); - else - vec_to[i] = static_cast(vec_from[i]); - } - } - } - } - - if constexpr (std::is_same_v) - return ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); - else - return col_to; - } - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - named_from.column->getName(), Name::name); - } -}; - -/** Conversion of DateTime to Date: throw off time component. - */ -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -/** Conversion of DateTime to Date32: throw off time component. - */ -template -struct ConvertImpl - : DateTimeTransformImpl {}; - -/** Conversion of Date to DateTime: adding 00:00:00 time component. - */ -template -struct ToDateTimeImpl -{ - static constexpr auto name = "toDateTime"; - - static UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (d > MAX_DATETIME_DAY_NUM) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Day number {} is out of bounds of type DateTime", d); - } - else if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) - { - if (d > MAX_DATETIME_DAY_NUM) - d = MAX_DATETIME_DAY_NUM; - } - return static_cast(time_zone.fromDayNum(DayNum(d))); - } - - static UInt32 execute(Int32 d, const DateLUTImpl & time_zone) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) - { - if (d < 0) - return 0; - else if (d > MAX_DATETIME_DAY_NUM) - d = MAX_DATETIME_DAY_NUM; - } - else if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (d < 0 || d > MAX_DATETIME_DAY_NUM) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type DateTime", d); - } - return static_cast(time_zone.fromDayNum(ExtendedDayNum(d))); - } - - static UInt32 execute(UInt32 dt, const DateLUTImpl & /*time_zone*/) - { - return dt; - } - - static UInt32 execute(Int64 dt64, const DateLUTImpl & /*time_zone*/) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Ignore) - return static_cast(dt64); - else - { - if (dt64 < 0 || dt64 >= MAX_DATETIME_TIMESTAMP) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) - return dt64 < 0 ? 0 : std::numeric_limits::max(); - else - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type DateTime", dt64); - } - else - return static_cast(dt64); - } - } -}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -/// Implementation of toDate function. 
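The Throw/Saturate/Ignore branches above are driven by the date_time_overflow_behavior setting (setting name assumed from this series; the default is 'ignore'). A sketch of the difference:

SET date_time_overflow_behavior = 'saturate';
SELECT toDateTime(toDate32('2299-12-31')); -- clamped to the maximum DateTime value
SET date_time_overflow_behavior = 'throw';
SELECT toDateTime(toDate32('2299-12-31')); -- { serverError VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE }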
- -template -struct ToDateTransform32Or64 -{ - static constexpr auto name = "toDate"; - - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (from > MAX_DATETIME_TIMESTAMP) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type Date", from); - } - /// if value is smaller (or equal) than maximum day value for Date, than treat it as day num, - /// otherwise treat it as unix timestamp. This is a bit weird, but we leave this behavior. - if (from <= DATE_LUT_MAX_DAY_NUM) - return from; - else - return time_zone.toDayNum(std::min(time_t(from), time_t(MAX_DATETIME_TIMESTAMP))); - } -}; - -/** Conversion of Date32 to Date. - */ -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ToDateTransform32Or64Signed -{ - static constexpr auto name = "toDate"; - - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) - { - // TODO: decide narrow or extended range based on FromType - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (from < 0 || from > MAX_DATE_TIMESTAMP) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type Date", from); - } - else - { - if (from < 0) - return 0; - } - return (from <= DATE_LUT_MAX_DAY_NUM) - ? static_cast(from) - : time_zone.toDayNum(std::min(time_t(from), time_t(MAX_DATE_TIMESTAMP))); - } -}; - -template -struct ToDateTransform8Or16Signed -{ - static constexpr auto name = "toDate"; - - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) - { - if (from < 0) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type Date", from); - else - return 0; - } - return from; - } -}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -/// Implementation of toDate32 function. 
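As the comment inside ToDateTransform32Or64 explains, toDate treats small numbers as day numbers and larger ones as unix timestamps; a sketch (dates shown for the UTC timezone):

SELECT toDate(1);          -- '1970-01-02': 1 <= 65535, treated as a day number
SELECT toDate(1700000000); -- '2023-11-14': treated as a unix timestamp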
- -template -struct ToDate32Transform32Or64 -{ - static constexpr auto name = "toDate32"; - - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) - { - if (from < DATE_LUT_MAX_EXTEND_DAY_NUM) - return static_cast(from); - else - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (from > MAX_DATETIME64_TIMESTAMP) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type Date32", from); - } - return time_zone.toDayNum(std::min(time_t(from), time_t(MAX_DATETIME64_TIMESTAMP))); - } - } -}; - -template -struct ToDate32Transform32Or64Signed -{ - static constexpr auto name = "toDate32"; - - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl & time_zone) - { - static const Int32 daynum_min_offset = -static_cast(time_zone.getDayNumOffsetEpoch()); - - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (from < daynum_min_offset || from > MAX_DATETIME64_TIMESTAMP) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type Date32", from); - } - - if (from < daynum_min_offset) - return daynum_min_offset; - - return (from < DATE_LUT_MAX_EXTEND_DAY_NUM) - ? static_cast(from) - : time_zone.toDayNum(std::min(time_t(Int64(from)), time_t(MAX_DATETIME64_TIMESTAMP))); - } -}; - -template -struct ToDate32Transform8Or16Signed -{ - static constexpr auto name = "toDate32"; - - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) - { - return from; - } -}; - -/** Special case of converting Int8, Int16, (U)Int32 or (U)Int64 (and also, for convenience, - * Float32, Float64) to Date. If the - * number is less than 65536, then it is treated as DayNum, and if it's greater or equals to 65536, - * then treated as unix timestamp. If the number exceeds UInt32, saturate to MAX_UINT32 then as DayNum. - * It's a bit illogical, as we actually have two functions in one. - * But allows to support frequent case, - * when user write toDate(UInt32), expecting conversion of unix timestamp to Date. - * (otherwise such usage would be frequent mistake). 
- */ -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - - -template -struct ToDateTimeTransform64 -{ - static constexpr auto name = "toDateTime"; - - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (from > MAX_DATETIME_TIMESTAMP) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime", from); - } - return static_cast(std::min(time_t(from), time_t(MAX_DATETIME_TIMESTAMP))); - } -}; - -template -struct ToDateTimeTransformSigned -{ - static constexpr auto name = "toDateTime"; - - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) - { - if (from < 0) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime", from); - else - return 0; - } - return from; - } -}; - -template -struct ToDateTimeTransform64Signed -{ - static constexpr auto name = "toDateTime"; - - static NO_SANITIZE_UNDEFINED ToType execute(const FromType & from, const DateLUTImpl &) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (from < 0 || from > MAX_DATETIME_TIMESTAMP) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime", from); - } - - if (from < 0) - return 0; - return static_cast(std::min(time_t(from), time_t(MAX_DATETIME_TIMESTAMP))); - } -}; - -/// Special case of converting Int8, Int16, Int32 or (U)Int64 (and also, for convenience, Float32, Float64) to DateTime. 
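/// Editorial sketch, not from the original file: the saturating clamp performed by
/// the ToDateTimeTransform* helpers above for signed 64-bit inputs. The bound is a
/// parameter here because MAX_DATETIME_TIMESTAMP is defined elsewhere in the tree.

#include <algorithm>
#include <cstdint>

uint32_t saturateToDateTime(int64_t from, int64_t max_datetime_timestamp)
{
    if (from < 0)
        return 0;   /// pre-epoch values saturate to 1970-01-01 00:00:00
    return static_cast<uint32_t>(std::min(from, max_datetime_timestamp));
}

/// Under DateTimeOverflowBehavior::Throw the same bounds raise
/// VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE instead of clamping.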
-template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -/** Conversion of numeric to DateTime64 - */ - -template -struct ToDateTime64TransformUnsigned -{ - static constexpr auto name = "toDateTime64"; - - const DateTime64::NativeType scale_multiplier = 1; - - ToDateTime64TransformUnsigned(UInt32 scale = 0) /// NOLINT - : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) - {} - - NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(FromType from, const DateLUTImpl &) const - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (from > MAX_DATETIME64_TIMESTAMP) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime64", from); - else - return DecimalUtils::decimalFromComponentsWithMultiplier(from, 0, scale_multiplier); - } - else - return DecimalUtils::decimalFromComponentsWithMultiplier(std::min(from, MAX_DATETIME64_TIMESTAMP), 0, scale_multiplier); - } -}; -template -struct ToDateTime64TransformSigned -{ - static constexpr auto name = "toDateTime64"; - - const DateTime64::NativeType scale_multiplier = 1; - - ToDateTime64TransformSigned(UInt32 scale = 0) /// NOLINT - : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) - {} - - NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(FromType from, const DateLUTImpl &) const - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (from < MIN_DATETIME64_TIMESTAMP || from > MAX_DATETIME64_TIMESTAMP) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime64", from); - } - from = static_cast(std::max(from, MIN_DATETIME64_TIMESTAMP)); - from = static_cast(std::min(from, MAX_DATETIME64_TIMESTAMP)); - - return DecimalUtils::decimalFromComponentsWithMultiplier(from, 0, scale_multiplier); - } -}; -template -struct ToDateTime64TransformFloat -{ - static constexpr auto name = "toDateTime64"; - - const UInt32 scale = 1; - - ToDateTime64TransformFloat(UInt32 scale_ = 0) /// NOLINT - : scale(scale_) - {} - - NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(FromType from, const DateLUTImpl &) const - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (from < MIN_DATETIME64_TIMESTAMP || from > MAX_DATETIME64_TIMESTAMP) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Timestamp value {} is out of bounds of type DateTime64", from); - } - - from = std::max(from, static_cast(MIN_DATETIME64_TIMESTAMP)); - from = std::min(from, static_cast(MAX_DATETIME64_TIMESTAMP)); - return convertToDecimal(from, scale); - } -}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct 
ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - - -/** Conversion of DateTime64 to Date or DateTime: discards fractional part. - */ -template -struct FromDateTime64Transform -{ - static constexpr auto name = Transform::name; - - const DateTime64::NativeType scale_multiplier = 1; - - FromDateTime64Transform(UInt32 scale) /// NOLINT - : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) - {} - - auto execute(DateTime64::NativeType dt, const DateLUTImpl & time_zone) const - { - const auto c = DecimalUtils::splitWithScaleMultiplier(DateTime64(dt), scale_multiplier); - return Transform::execute(static_cast(c.whole), time_zone); - } -}; - -/** Conversion of DateTime64 to Date or DateTime: discards fractional part. - */ -template -struct ConvertImpl - : DateTimeTransformImpl>, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl>, false> {}; - -struct ToDateTime64Transform -{ - static constexpr auto name = "toDateTime64"; - - const DateTime64::NativeType scale_multiplier = 1; - - ToDateTime64Transform(UInt32 scale = 0) /// NOLINT - : scale_multiplier(DecimalUtils::scaleMultiplier(scale)) - {} - - DateTime64::NativeType execute(UInt16 d, const DateLUTImpl & time_zone) const - { - const auto dt = ToDateTimeImpl<>::execute(d, time_zone); - return execute(dt, time_zone); - } - - DateTime64::NativeType execute(Int32 d, const DateLUTImpl & time_zone) const - { - Int64 dt = static_cast(time_zone.fromDayNum(ExtendedDayNum(d))); - return DecimalUtils::decimalFromComponentsWithMultiplier(dt, 0, scale_multiplier); - } - - DateTime64::NativeType execute(UInt32 dt, const DateLUTImpl & /*time_zone*/) const - { - return DecimalUtils::decimalFromComponentsWithMultiplier(dt, 0, scale_multiplier); - } -}; - -/** Conversion of Date or DateTime to DateTime64: add zero sub-second part. - */ -template -struct ConvertImpl - : DateTimeTransformImpl {}; - -template -struct ConvertImpl - : DateTimeTransformImpl {}; - -template -struct ConvertImpl - : DateTimeTransformImpl {}; - - -/** Transformation of numbers, dates, datetimes to strings: through formatting. 
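- *  E.g. a DataTypeDateTime value is rendered as 'YYYY-MM-DD hh:mm:ss' by
- *  writeDateTimeText; each FormatImpl specialization below writes the text form
- *  of one value into a WriteBuffer.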
- */ -template -struct FormatImpl -{ - template - static ReturnType execute(const typename DataType::FieldType x, WriteBuffer & wb, const DataType *, const DateLUTImpl *) - { - writeText(x, wb); - return ReturnType(true); - } -}; - -template <> -struct FormatImpl -{ - template - static ReturnType execute(const DataTypeDate::FieldType x, WriteBuffer & wb, const DataTypeDate *, const DateLUTImpl * time_zone) - { - writeDateText(DayNum(x), wb, *time_zone); - return ReturnType(true); - } -}; - -template <> -struct FormatImpl -{ - template - static ReturnType execute(const DataTypeDate32::FieldType x, WriteBuffer & wb, const DataTypeDate32 *, const DateLUTImpl * time_zone) - { - writeDateText(ExtendedDayNum(x), wb, *time_zone); - return ReturnType(true); - } -}; - -template <> -struct FormatImpl -{ - template - static ReturnType execute(const DataTypeDateTime::FieldType x, WriteBuffer & wb, const DataTypeDateTime *, const DateLUTImpl * time_zone) - { - writeDateTimeText(x, wb, *time_zone); - return ReturnType(true); - } -}; - -template <> -struct FormatImpl -{ - template - static ReturnType execute(const DataTypeDateTime64::FieldType x, WriteBuffer & wb, const DataTypeDateTime64 * type, const DateLUTImpl * time_zone) - { - writeDateTimeText(DateTime64(x), type->getScale(), wb, *time_zone); - return ReturnType(true); - } -}; - - -template -struct FormatImpl> -{ - template - static ReturnType execute(const FieldType x, WriteBuffer & wb, const DataTypeEnum * type, const DateLUTImpl *) - { - static constexpr bool throw_exception = std::is_same_v; - - if constexpr (throw_exception) - { - writeString(type->getNameForValue(x), wb); - } - else - { - StringRef res; - bool is_ok = type->getNameForValue(x, res); - if (is_ok) - writeString(res, wb); - return ReturnType(is_ok); - } - } -}; - -template -struct FormatImpl> -{ - template - static ReturnType execute(const FieldType x, WriteBuffer & wb, const DataTypeDecimal * type, const DateLUTImpl *) - { - writeText(x, type->getScale(), wb, false); - return ReturnType(true); - } -}; - - -/// DataTypeEnum to DataType free conversion -template -struct ConvertImpl, DataTypeNumber, Name, ConvertDefaultBehaviorTag> -{ - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) - { - return arguments[0].column; - } -}; - -static inline ColumnUInt8::MutablePtr copyNullMap(ColumnPtr col) -{ - ColumnUInt8::MutablePtr null_map = nullptr; - if (const auto * col_null = checkAndGetColumn(col.get())) - { - null_map = ColumnUInt8::create(); - null_map->insertRangeFrom(col_null->getNullMapColumn(), 0, col_null->size()); - } - return null_map; -} - -template -requires (!std::is_same_v) -struct ConvertImpl -{ - using FromFieldType = typename FromDataType::FieldType; - using ColVecType = ColumnVectorOrDecimal; - - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) - { - if constexpr (IsDataTypeDateOrDateTime) - { - auto datetime_arg = arguments[0]; - - const DateLUTImpl * time_zone = nullptr; - const ColumnConst * time_zone_column = nullptr; - - if (arguments.size() == 1) - { - auto non_null_args = createBlockWithNestedColumns(arguments); - time_zone = &extractTimeZoneFromFunctionArguments(non_null_args, 1, 0); - } - else /// When we have a column for timezone - { - datetime_arg.column = datetime_arg.column->convertToFullColumnIfConst(); - - if constexpr (std::is_same_v || std::is_same_v) - time_zone = &DateLUT::instance(); - /// For argument of Date or 
DateTime type, second argument with time zone could be specified. - if constexpr (std::is_same_v || std::is_same_v) - { - if ((time_zone_column = checkAndGetColumnConst(arguments[1].column.get()))) - { - auto non_null_args = createBlockWithNestedColumns(arguments); - time_zone = &extractTimeZoneFromFunctionArguments(non_null_args, 1, 0); - } - } - } - const auto & col_with_type_and_name = columnGetNested(datetime_arg); - - if (const auto col_from = checkAndGetColumn(col_with_type_and_name.column.get())) - { - auto col_to = ColumnString::create(); - - const typename ColVecType::Container & vec_from = col_from->getData(); - ColumnString::Chars & data_to = col_to->getChars(); - ColumnString::Offsets & offsets_to = col_to->getOffsets(); - size_t size = vec_from.size(); - - if constexpr (std::is_same_v) - data_to.resize(size * (strlen("YYYY-MM-DD") + 1)); - else if constexpr (std::is_same_v) - data_to.resize(size * (strlen("YYYY-MM-DD") + 1)); - else if constexpr (std::is_same_v) - data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss") + 1)); - else if constexpr (std::is_same_v) - data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + col_from->getScale() + 1)); - else - data_to.resize(size * 3); /// Arbitrary - - offsets_to.resize(size); - - WriteBufferFromVector write_buffer(data_to); - const auto & type = static_cast(*col_with_type_and_name.type); - - ColumnUInt8::MutablePtr null_map = copyNullMap(datetime_arg.column); - - if (!null_map && arguments.size() > 1) - null_map = copyNullMap(arguments[1].column->convertToFullColumnIfConst()); - - if (null_map) - { - for (size_t i = 0; i < size; ++i) - { - if (!time_zone_column && arguments.size() > 1) - { - if (!arguments[1].column.get()->getDataAt(i).toString().empty()) - time_zone = &DateLUT::instance(arguments[1].column.get()->getDataAt(i).toString()); - else - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Provided time zone must be non-empty"); - } - bool is_ok = FormatImpl::template execute(vec_from[i], write_buffer, &type, time_zone); - null_map->getData()[i] |= !is_ok; - writeChar(0, write_buffer); - offsets_to[i] = write_buffer.count(); - } - } - else - { - for (size_t i = 0; i < size; ++i) - { - if (!time_zone_column && arguments.size() > 1) - { - if (!arguments[1].column.get()->getDataAt(i).toString().empty()) - time_zone = &DateLUT::instance(arguments[1].column.get()->getDataAt(i).toString()); - else - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Provided time zone must be non-empty"); - } - FormatImpl::template execute(vec_from[i], write_buffer, &type, time_zone); - writeChar(0, write_buffer); - offsets_to[i] = write_buffer.count(); - } - } - - write_buffer.finalize(); - - if (null_map) - return ColumnNullable::create(std::move(col_to), std::move(null_map)); - return col_to; - } - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - arguments[0].column->getName(), Name::name); - } - else - { - ColumnUInt8::MutablePtr null_map = copyNullMap(arguments[0].column); - - const auto & col_with_type_and_name = columnGetNested(arguments[0]); - const auto & type = static_cast(*col_with_type_and_name.type); - - if (const auto col_from = checkAndGetColumn(col_with_type_and_name.column.get())) - { - auto col_to = ColumnString::create(); - - const typename ColVecType::Container & vec_from = col_from->getData(); - ColumnString::Chars & data_to = col_to->getChars(); - ColumnString::Offsets & offsets_to = col_to->getOffsets(); - size_t size = vec_from.size(); - - 
data_to.resize(size * 3); - offsets_to.resize(size); - - WriteBufferFromVector write_buffer(data_to); - - if (null_map) - { - for (size_t i = 0; i < size; ++i) - { - bool is_ok = FormatImpl::template execute(vec_from[i], write_buffer, &type, nullptr); - /// We don't use timezones in this branch - null_map->getData()[i] |= !is_ok; - writeChar(0, write_buffer); - offsets_to[i] = write_buffer.count(); - } - } - else - { - for (size_t i = 0; i < size; ++i) - { - FormatImpl::template execute(vec_from[i], write_buffer, &type, nullptr); - writeChar(0, write_buffer); - offsets_to[i] = write_buffer.count(); - } - } - - write_buffer.finalize(); - - if (null_map) - return ColumnNullable::create(std::move(col_to), std::move(null_map)); - return col_to; - } - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - arguments[0].column->getName(), Name::name); - } - } -}; - - -/// Generic conversion of any type to String or FixedString via serialization to text. -template -struct ConvertImplGenericToString -{ - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) - { - static_assert(std::is_same_v || std::is_same_v, - "Can be used only to serialize to ColumnString or ColumnFixedString"); - - ColumnUInt8::MutablePtr null_map = copyNullMap(arguments[0].column); - - const auto & col_with_type_and_name = columnGetNested(arguments[0]); - const IDataType & type = *col_with_type_and_name.type; - const IColumn & col_from = *col_with_type_and_name.column; - - size_t size = col_from.size(); - auto col_to = removeNullable(result_type)->createColumn(); - - { - ColumnStringHelpers::WriteHelper write_helper( - assert_cast(*col_to), - size); - - auto & write_buffer = write_helper.getWriteBuffer(); - - FormatSettings format_settings; - auto serialization = type.getDefaultSerialization(); - for (size_t row = 0; row < size; ++row) - { - serialization->serializeText(col_from, row, write_buffer, format_settings); - write_helper.rowWritten(); - } - - write_helper.finalize(); - } - - if (result_type->isNullable() && null_map) - return ColumnNullable::create(std::move(col_to), std::move(null_map)); - return col_to; - } -}; - -/** Conversion of time_t to UInt16, Int32, UInt32 - */ -template -void convertFromTime(typename DataType::FieldType & x, time_t & time) -{ - x = time; -} - -template <> -inline void convertFromTime(DataTypeDate::FieldType & x, time_t & time) -{ - if (unlikely(time < 0)) - x = 0; - else if (unlikely(time > 0xFFFF)) - x = 0xFFFF; - else - x = time; -} - -template <> -inline void convertFromTime(DataTypeDate32::FieldType & x, time_t & time) -{ - x = static_cast(time); -} - -template <> -inline void convertFromTime(DataTypeDateTime::FieldType & x, time_t & time) -{ - if (unlikely(time < 0)) - x = 0; - else if (unlikely(time > MAX_DATETIME_TIMESTAMP)) - x = MAX_DATETIME_TIMESTAMP; - else - x = static_cast(time); -} - -/** Conversion of strings to numbers, dates, datetimes: through parsing. 
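- *  Each parseImpl overload below reads one value from a ReadBuffer. For floats,
- *  the precise_float_parsing setting selects between readFloatTextPrecise and
- *  readFloatTextFast.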
- */ -template -void parseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool precise_float_parsing) -{ - if constexpr (std::is_floating_point_v) - { - if (precise_float_parsing) - readFloatTextPrecise(x, rb); - else - readFloatTextFast(x, rb); - } - else - readText(x, rb); -} - -template <> -inline void parseImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) -{ - DayNum tmp(0); - readDateText(tmp, rb, *time_zone); - x = tmp; -} - -template <> -inline void parseImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) -{ - ExtendedDayNum tmp(0); - readDateText(tmp, rb, *time_zone); - x = tmp; -} - - -// NOTE: no need of extra overload of DateTime64, since readDateTimeText64 has different signature and that case is explicitly handled in the calling code. -template <> -inline void parseImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) -{ - time_t time = 0; - readDateTimeText(time, rb, *time_zone); - convertFromTime(x, time); -} - -template <> -inline void parseImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) -{ - UUID tmp; - readUUIDText(tmp, rb); - x = tmp.toUnderType(); -} - -template <> -inline void parseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) -{ - IPv4 tmp; - readIPv4Text(tmp, rb); - x = tmp.toUnderType(); -} - -template <> -inline void parseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) -{ - IPv6 tmp; - readIPv6Text(tmp, rb); - x = tmp; -} - -template -bool tryParseImpl(typename DataType::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool precise_float_parsing) -{ - if constexpr (std::is_floating_point_v) - { - if (precise_float_parsing) - return tryReadFloatTextPrecise(x, rb); - else - return tryReadFloatTextFast(x, rb); - } - else /*if constexpr (is_integer_v)*/ - return tryReadIntText(x, rb); -} - -template <> -inline bool tryParseImpl(DataTypeDate::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) -{ - DayNum tmp(0); - if (!tryReadDateText(tmp, rb, *time_zone)) - return false; - x = tmp; - return true; -} - -template <> -inline bool tryParseImpl(DataTypeDate32::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) -{ - ExtendedDayNum tmp(0); - if (!tryReadDateText(tmp, rb, *time_zone)) - return false; - x = tmp; - return true; -} - -template <> -inline bool tryParseImpl(DataTypeDateTime::FieldType & x, ReadBuffer & rb, const DateLUTImpl * time_zone, bool) -{ - time_t time = 0; - if (!tryReadDateTimeText(time, rb, *time_zone)) - return false; - convertFromTime(x, time); - return true; -} - -template <> -inline bool tryParseImpl(DataTypeUUID::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) -{ - UUID tmp; - if (!tryReadUUIDText(tmp, rb)) - return false; - - x = tmp.toUnderType(); - return true; -} - -template <> -inline bool tryParseImpl(DataTypeIPv4::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) -{ - IPv4 tmp; - if (!tryReadIPv4Text(tmp, rb)) - return false; - - x = tmp.toUnderType(); - return true; -} - -template <> -inline bool tryParseImpl(DataTypeIPv6::FieldType & x, ReadBuffer & rb, const DateLUTImpl *, bool) -{ - IPv6 tmp; - if (!tryReadIPv6Text(tmp, rb)) - return false; - - x = tmp; - return true; -} - - -/** Throw exception with verbose message when string value is not parsed completely. 
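- *  E.g. parsing '123abc' as a number consumes only '123'; the buffer is then not
- *  fully read, and the message below reports the position and the parsed prefix.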
- */ -[[noreturn]] inline void throwExceptionForIncompletelyParsedValue(ReadBuffer & read_buffer, const IDataType & result_type) -{ - WriteBufferFromOwnString message_buf; - message_buf << "Cannot parse string " << quote << String(read_buffer.buffer().begin(), read_buffer.buffer().size()) - << " as " << result_type.getName() - << ": syntax error"; - - if (read_buffer.offset()) - message_buf << " at position " << read_buffer.offset() - << " (parsed just " << quote << String(read_buffer.buffer().begin(), read_buffer.offset()) << ")"; - else - message_buf << " at begin of string"; - - // Currently there are no functions toIPv{4,6}Or{Null,Zero} - if (isNativeNumber(result_type) && !(result_type.getName() == "IPv4" || result_type.getName() == "IPv6")) - message_buf << ". Note: there are to" << result_type.getName() << "OrZero and to" << result_type.getName() << "OrNull functions, which returns zero/NULL instead of throwing exception."; - - throw Exception(PreformattedMessage{message_buf.str(), "Cannot parse string {} as {}: syntax error {}"}, ErrorCodes::CANNOT_PARSE_TEXT); -} - - -enum class ConvertFromStringExceptionMode -{ - Throw, /// Throw exception if value cannot be parsed. - Zero, /// Fill with zero or default if value cannot be parsed. - Null /// Return ColumnNullable with NULLs when value cannot be parsed. -}; - -enum class ConvertFromStringParsingMode -{ - Normal, - BestEffort, /// Only applicable for DateTime. Will use sophisticated method, that is slower. - BestEffortUS -}; - -template -struct ConvertThroughParsing -{ - static_assert(std::is_same_v || std::is_same_v, - "ConvertThroughParsing is only applicable for String or FixedString data types"); - - static constexpr bool to_datetime64 = std::is_same_v; - - static bool isAllRead(ReadBuffer & in) - { - /// In case of FixedString, skip zero bytes at end. - if constexpr (std::is_same_v) - while (!in.eof() && *in.position() == 0) - ++in.position(); - - if (in.eof()) - return true; - - /// Special case, that allows to parse string with DateTime or DateTime64 as Date or Date32. - if constexpr (std::is_same_v || std::is_same_v) - { - if (!in.eof() && (*in.position() == ' ' || *in.position() == 'T')) - { - if (in.buffer().size() == strlen("YYYY-MM-DD hh:mm:ss")) - return true; - - if (in.buffer().size() >= strlen("YYYY-MM-DD hh:mm:ss.x") - && in.buffer().begin()[19] == '.') - { - in.position() = in.buffer().begin() + 20; - - while (!in.eof() && isNumericASCII(*in.position())) - ++in.position(); - - if (in.eof()) - return true; - } - } - } - - return false; - } - - template - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & res_type, size_t input_rows_count, - Additions additions [[maybe_unused]] = Additions()) - { - using ColVecTo = typename ToDataType::ColumnType; - - const DateLUTImpl * local_time_zone [[maybe_unused]] = nullptr; - const DateLUTImpl * utc_time_zone [[maybe_unused]] = nullptr; - - /// For conversion to Date or DateTime type, second argument with time zone could be specified. - if constexpr (std::is_same_v || to_datetime64) - { - const auto result_type = removeNullable(res_type); - // Time zone is already figured out during result type resolution, no need to do it here. 
- if (const auto dt_col = checkAndGetDataType(result_type.get())) - local_time_zone = &dt_col->getTimeZone(); - else - local_time_zone = &extractTimeZoneFromFunctionArguments(arguments, 1, 0); - - if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffort || parsing_mode == ConvertFromStringParsingMode::BestEffortUS) - utc_time_zone = &DateLUT::instance("UTC"); - } - else if constexpr (std::is_same_v || std::is_same_v) - { - // Timezone is more or less dummy when parsing Date/Date32 from string. - local_time_zone = &DateLUT::instance(); - utc_time_zone = &DateLUT::instance("UTC"); - } - - const IColumn * col_from = arguments[0].column.get(); - const ColumnString * col_from_string = checkAndGetColumn(col_from); - const ColumnFixedString * col_from_fixed_string = checkAndGetColumn(col_from); - - if (std::is_same_v && !col_from_string) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - col_from->getName(), Name::name); - - if (std::is_same_v && !col_from_fixed_string) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - col_from->getName(), Name::name); - - size_t size = input_rows_count; - typename ColVecTo::MutablePtr col_to = nullptr; - - if constexpr (IsDataTypeDecimal) - { - UInt32 scale = additions; - if constexpr (to_datetime64) - { - ToDataType check_bounds_in_ctor(scale, local_time_zone ? local_time_zone->getTimeZone() : String{}); - } - else - { - ToDataType check_bounds_in_ctor(ToDataType::maxPrecision(), scale); - } - col_to = ColVecTo::create(size, scale); - } - else - col_to = ColVecTo::create(size); - - typename ColVecTo::Container & vec_to = col_to->getData(); - - ColumnUInt8::MutablePtr col_null_map_to; - ColumnUInt8::Container * vec_null_map_to [[maybe_unused]] = nullptr; - if constexpr (exception_mode == ConvertFromStringExceptionMode::Null) - { - col_null_map_to = ColumnUInt8::create(size); - vec_null_map_to = &col_null_map_to->getData(); - } - - const ColumnString::Chars * chars = nullptr; - const IColumn::Offsets * offsets = nullptr; - size_t fixed_string_size = 0; - - if constexpr (std::is_same_v) - { - chars = &col_from_string->getChars(); - offsets = &col_from_string->getOffsets(); - } - else - { - chars = &col_from_fixed_string->getChars(); - fixed_string_size = col_from_fixed_string->getN(); - } - - size_t current_offset = 0; - - bool precise_float_parsing = false; - - if (DB::CurrentThread::isInitialized()) - { - const DB::ContextPtr query_context = DB::CurrentThread::get().getQueryContext(); - - if (query_context) - precise_float_parsing = query_context->getSettingsRef().precise_float_parsing; - } - - for (size_t i = 0; i < size; ++i) - { - size_t next_offset = std::is_same_v ? (*offsets)[i] : (current_offset + fixed_string_size); - size_t string_size = std::is_same_v ? 
next_offset - current_offset - 1 : fixed_string_size; - - ReadBufferFromMemory read_buffer(&(*chars)[current_offset], string_size); - - if constexpr (exception_mode == ConvertFromStringExceptionMode::Throw) - { - if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffort) - { - if constexpr (to_datetime64) - { - DateTime64 res = 0; - parseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); - vec_to[i] = res; - } - else - { - time_t res; - parseDateTimeBestEffort(res, read_buffer, *local_time_zone, *utc_time_zone); - convertFromTime(vec_to[i], res); - } - } - else if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffortUS) - { - if constexpr (to_datetime64) - { - DateTime64 res = 0; - parseDateTime64BestEffortUS(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); - vec_to[i] = res; - } - else - { - time_t res; - parseDateTimeBestEffortUS(res, read_buffer, *local_time_zone, *utc_time_zone); - convertFromTime(vec_to[i], res); - } - } - else - { - if constexpr (to_datetime64) - { - DateTime64 value = 0; - readDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone); - vec_to[i] = value; - } - else if constexpr (IsDataTypeDecimal) - { - SerializationDecimal::readText( - vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale()); - } - else - { - /// we want to utilize constexpr condition here, which is not mixable with value comparison - do - { - if constexpr (std::is_same_v && std::is_same_v) - { - if (fixed_string_size == IPV6_BINARY_LENGTH) - { - readBinary(vec_to[i], read_buffer); - break; - } - } - parseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing); - } while (false); - } - } - - if (!isAllRead(read_buffer)) - throwExceptionForIncompletelyParsedValue(read_buffer, *res_type); - } - else - { - bool parsed; - - if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffort) - { - if constexpr (to_datetime64) - { - DateTime64 res = 0; - parsed = tryParseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); - vec_to[i] = res; - } - else - { - time_t res; - parsed = tryParseDateTimeBestEffort(res, read_buffer, *local_time_zone, *utc_time_zone); - convertFromTime(vec_to[i],res); - } - } - else if constexpr (parsing_mode == ConvertFromStringParsingMode::BestEffortUS) - { - if constexpr (to_datetime64) - { - DateTime64 res = 0; - parsed = tryParseDateTime64BestEffortUS(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone); - vec_to[i] = res; - } - else - { - time_t res; - parsed = tryParseDateTimeBestEffortUS(res, read_buffer, *local_time_zone, *utc_time_zone); - convertFromTime(vec_to[i],res); - } - } - else - { - if constexpr (to_datetime64) - { - DateTime64 value = 0; - parsed = tryReadDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone); - vec_to[i] = value; - } - else if constexpr (IsDataTypeDecimal) - { - parsed = SerializationDecimal::tryReadText( - vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale()); - } - else - { - /// we want to utilize constexpr condition here, which is not mixable with value comparison - do - { - if constexpr (std::is_same_v && std::is_same_v) - { - if (fixed_string_size == IPV6_BINARY_LENGTH) - { - readBinary(vec_to[i], read_buffer); - parsed = true; - break; - } - } - - parsed = tryParseImpl(vec_to[i], read_buffer, local_time_zone, precise_float_parsing); - } while (false); - } - } - - if (!isAllRead(read_buffer)) - parsed = 
false; - - if (!parsed) - { - if constexpr (std::is_same_v) - { - vec_to[i] = -static_cast(DateLUT::instance().getDayNumOffsetEpoch()); - } - else - { - vec_to[i] = static_cast(0); - } - } - - if constexpr (exception_mode == ConvertFromStringExceptionMode::Null) - (*vec_null_map_to)[i] = !parsed; - } - - current_offset = next_offset; - } - - if constexpr (exception_mode == ConvertFromStringExceptionMode::Null) - return ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); - else - return col_to; - } -}; - - -template -requires (!std::is_same_v) -struct ConvertImpl - : ConvertThroughParsing {}; - -template -requires (!std::is_same_v) -struct ConvertImpl - : ConvertThroughParsing {}; - -template -requires (!std::is_same_v) -struct ConvertImpl - : ConvertThroughParsing {}; - -template -requires (!std::is_same_v) -struct ConvertImpl - : ConvertThroughParsing {}; - -template -requires (is_any_of && is_any_of) -struct ConvertImpl - : ConvertThroughParsing {}; - -/// Generic conversion of any type from String. Used for complex types: Array and Tuple or types with custom serialization. -struct ConvertImplGenericFromString -{ - static ColumnPtr execute(ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t input_rows_count) - { - const IColumn & column_from = *arguments[0].column; - const IDataType & data_type_to = *result_type; - auto res = data_type_to.createColumn(); - auto serialization = data_type_to.getDefaultSerialization(); - const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr; - - executeImpl(column_from, *res, *serialization, input_rows_count, null_map, result_type.get()); - return res; - } - - static void executeImpl( - const IColumn & column_from, - IColumn & column_to, - const ISerialization & serialization_from, - size_t input_rows_count, - const PaddedPODArray * null_map, - const IDataType * result_type) - { - column_to.reserve(input_rows_count); - - FormatSettings format_settings; - for (size_t i = 0; i < input_rows_count; ++i) - { - if (null_map && (*null_map)[i]) - { - column_to.insertDefault(); - continue; - } - - const auto & val = column_from.getDataAt(i); - ReadBufferFromMemory read_buffer(val.data, val.size); - try - { - serialization_from.deserializeWholeText(column_to, read_buffer, format_settings); - } - catch (const Exception & e) - { - auto * nullable_column = typeid_cast(&column_to); - if (e.code() == ErrorCodes::CANNOT_PARSE_BOOL && nullable_column) - { - auto & col_nullmap = nullable_column->getNullMapData(); - if (col_nullmap.size() != nullable_column->size()) - col_nullmap.resize_fill(nullable_column->size()); - if (nullable_column->size() == (i + 1)) - nullable_column->popBack(1); - nullable_column->insertDefault(); - continue; - } - throw; - } - - if (!read_buffer.eof()) - { - if (result_type) - throwExceptionForIncompletelyParsedValue(read_buffer, *result_type); - else - throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, - "Cannot parse string to column {}. Expected eof", column_to.getName()); - } - } - } -}; - - -template <> -struct ConvertImpl - : ConvertImpl {}; - -template <> -struct ConvertImpl - : ConvertImpl {}; - -/** If types are identical, just take reference to column. 
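- *  E.g. converting a UInt32 column to UInt32 just returns arguments[0].column,
- *  without copying any data.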
- */ -template -requires (!T::is_parametric) -struct ConvertImpl -{ - template - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/, - Additions additions [[maybe_unused]] = Additions()) - { - return arguments[0].column; - } -}; - -template -struct ConvertImpl -{ - template - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/, - Additions additions [[maybe_unused]] = Additions()) - { - - return arguments[0].column; - } -}; - - -/** Conversion from FixedString to String. - * Cutting sequences of zero bytes from end of strings. - */ -template -struct ConvertImpl -{ - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type, size_t /*input_rows_count*/) - { - ColumnUInt8::MutablePtr null_map = copyNullMap(arguments[0].column); - const auto & nested = columnGetNested(arguments[0]); - if (const ColumnFixedString * col_from = checkAndGetColumn(nested.column.get())) - { - auto col_to = ColumnString::create(); - - const ColumnFixedString::Chars & data_from = col_from->getChars(); - ColumnString::Chars & data_to = col_to->getChars(); - ColumnString::Offsets & offsets_to = col_to->getOffsets(); - size_t size = col_from->size(); - size_t n = col_from->getN(); - data_to.resize(size * (n + 1)); /// + 1 - zero terminator - offsets_to.resize(size); - - size_t offset_from = 0; - size_t offset_to = 0; - for (size_t i = 0; i < size; ++i) - { - if (!null_map || !null_map->getData()[i]) - { - size_t bytes_to_copy = n; - while (bytes_to_copy > 0 && data_from[offset_from + bytes_to_copy - 1] == 0) - --bytes_to_copy; - - memcpy(&data_to[offset_to], &data_from[offset_from], bytes_to_copy); - offset_to += bytes_to_copy; - } - data_to[offset_to] = 0; - ++offset_to; - offsets_to[i] = offset_to; - offset_from += n; - } - - data_to.resize(offset_to); - if (return_type->isNullable() && null_map) - return ColumnNullable::create(std::move(col_to), std::move(null_map)); - return col_to; - } - else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - arguments[0].column->getName(), Name::name); - } -}; - - -/// Declared early because used below. 
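/// Editorial sketch, not from the original file: the trailing-zero trim used by the
/// FixedString -> String conversion above, applied to a single value rather than to
/// the columnar char/offset arrays.

#include <string_view>

std::string_view trimTrailingZeroBytes(std::string_view fixed)
{
    size_t n = fixed.size();
    while (n > 0 && fixed[n - 1] == '\0')
        --n;   /// drop the zero padding at the end, as in the loop above
    return fixed.substr(0, n);
}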
-struct NameToDate { static constexpr auto name = "toDate"; };
-struct NameToDate32 { static constexpr auto name = "toDate32"; };
-struct NameToDateTime { static constexpr auto name = "toDateTime"; };
-struct NameToDateTime32 { static constexpr auto name = "toDateTime32"; };
-struct NameToDateTime64 { static constexpr auto name = "toDateTime64"; };
-struct NameToString { static constexpr auto name = "toString"; };
-struct NameToDecimal32 { static constexpr auto name = "toDecimal32"; };
-struct NameToDecimal64 { static constexpr auto name = "toDecimal64"; };
-struct NameToDecimal128 { static constexpr auto name = "toDecimal128"; };
-struct NameToDecimal256 { static constexpr auto name = "toDecimal256"; };
-
-
-#define DEFINE_NAME_TO_INTERVAL(INTERVAL_KIND) \
-    struct NameToInterval ## INTERVAL_KIND \
-    { \
-        static constexpr auto name = "toInterval" #INTERVAL_KIND; \
-        static constexpr auto kind = IntervalKind::Kind::INTERVAL_KIND; \
-    };
-
-DEFINE_NAME_TO_INTERVAL(Nanosecond)
-DEFINE_NAME_TO_INTERVAL(Microsecond)
-DEFINE_NAME_TO_INTERVAL(Millisecond)
-DEFINE_NAME_TO_INTERVAL(Second)
-DEFINE_NAME_TO_INTERVAL(Minute)
-DEFINE_NAME_TO_INTERVAL(Hour)
-DEFINE_NAME_TO_INTERVAL(Day)
-DEFINE_NAME_TO_INTERVAL(Week)
-DEFINE_NAME_TO_INTERVAL(Month)
-DEFINE_NAME_TO_INTERVAL(Quarter)
-DEFINE_NAME_TO_INTERVAL(Year)
-
-#undef DEFINE_NAME_TO_INTERVAL
-
-struct NameParseDateTimeBestEffort;
-struct NameParseDateTimeBestEffortOrZero;
-struct NameParseDateTimeBestEffortOrNull;
-
-template <typename Name, typename ToDataType>
-static inline bool isDateTime64(const ColumnsWithTypeAndName & arguments)
-{
-    if constexpr (std::is_same_v<ToDataType, DataTypeDateTime64>)
-        return true;
-    else if constexpr (std::is_same_v<Name, NameToDateTime> || std::is_same_v<Name, NameParseDateTimeBestEffort>
-        || std::is_same_v<Name, NameParseDateTimeBestEffortOrZero> || std::is_same_v<Name, NameParseDateTimeBestEffortOrNull>)
-    {
-        return (arguments.size() == 2 && isUInt(arguments[1].type)) || arguments.size() == 3;
-    }
-
-    return false;
-}
-
-template <typename ToDataType, typename Name, typename MonotonicityImpl>
-class FunctionConvert : public IFunction
-{
-public:
-    using Monotonic = MonotonicityImpl;
-
-    static constexpr auto name = Name::name;
-    static constexpr bool to_decimal =
-        std::is_same_v<ToDataType, DataTypeDecimal<Decimal32>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal64>>
-        || std::is_same_v<ToDataType, DataTypeDecimal<Decimal128>> || std::is_same_v<ToDataType, DataTypeDecimal<Decimal256>>;
-
-    static constexpr bool to_datetime64 = std::is_same_v<ToDataType, DataTypeDateTime64>;
-
-    static constexpr bool to_string_or_fixed_string = std::is_same_v<ToDataType, DataTypeString> ||
-        std::is_same_v<ToDataType, DataTypeFixedString>;
-
-    static constexpr bool to_date_or_datetime = std::is_same_v<ToDataType, DataTypeDate> ||
-        std::is_same_v<ToDataType, DataTypeDate32> ||
-        std::is_same_v<ToDataType, DataTypeDateTime>;
-
-    static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionConvert>(context); }
-    static FunctionPtr create() { return std::make_shared<FunctionConvert>(); }
-
-    FunctionConvert() = default;
-    explicit FunctionConvert(ContextPtr context_) : context(context_) {}
-
-    String getName() const override
-    {
-        return name;
-    }
-
-    bool isVariadic() const override { return true; }
-    size_t getNumberOfArguments() const override { return 0; }
-    bool isInjective(const ColumnsWithTypeAndName &) const override { return std::is_same_v<Name, NameToString>; }
-    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & arguments) const override
-    {
-        /// TODO: We can make more optimizations here.
- return !(to_date_or_datetime && isNumber(*arguments[0].type)); - } - - using DefaultReturnTypeGetter = std::function; - static DataTypePtr getReturnTypeDefaultImplementationForNulls(const ColumnsWithTypeAndName & arguments, const DefaultReturnTypeGetter & getter) - { - NullPresence null_presence = getNullPresense(arguments); - - if (null_presence.has_null_constant) - { - return makeNullable(std::make_shared()); - } - if (null_presence.has_nullable) - { - auto nested_columns = Block(createBlockWithNestedColumns(arguments)); - auto return_type = getter(ColumnsWithTypeAndName(nested_columns.begin(), nested_columns.end())); - return makeNullable(return_type); - } - - return getter(arguments); - } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - auto getter = [&] (const auto & args) { return getReturnTypeImplRemovedNullable(args); }; - auto res = getReturnTypeDefaultImplementationForNulls(arguments, getter); - to_nullable = res->isNullable(); - checked_return_type = true; - return res; - } - - DataTypePtr getReturnTypeImplRemovedNullable(const ColumnsWithTypeAndName & arguments) const - { - FunctionArgumentDescriptors mandatory_args = {{"Value", nullptr, nullptr, nullptr}}; - FunctionArgumentDescriptors optional_args; - - if constexpr (to_decimal) - { - mandatory_args.push_back({"scale", static_cast(&isNativeInteger), &isColumnConst, "const Integer"}); - } - - if (!to_decimal && isDateTime64(arguments)) - { - mandatory_args.push_back({"scale", static_cast(&isNativeInteger), &isColumnConst, "const Integer"}); - } - - // toString(DateTime or DateTime64, [timezone: String]) - if ((std::is_same_v && !arguments.empty() && (isDateTime64(arguments[0].type) || isDateTime(arguments[0].type))) - // toUnixTimestamp(value[, timezone : String]) - || std::is_same_v - // toDate(value[, timezone : String]) - || std::is_same_v // TODO: shall we allow timestamp argument for toDate? DateTime knows nothing about timezones and this argument is ignored below. - // toDate32(value[, timezone : String]) - || std::is_same_v - // toDateTime(value[, timezone: String]) - || std::is_same_v - // toDateTime64(value, scale : Integer[, timezone: String]) - || std::is_same_v) - { - optional_args.push_back({"timezone", static_cast(&isString), nullptr, "String"}); - } - - validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args); - - if constexpr (std::is_same_v) - { - return std::make_shared(Name::kind); - } - else if constexpr (to_decimal) - { - UInt64 scale = extractToDecimalScale(arguments[1]); - - if constexpr (std::is_same_v) - return createDecimalMaxPrecision(scale); - else if constexpr (std::is_same_v) - return createDecimalMaxPrecision(scale); - else if constexpr (std::is_same_v) - return createDecimalMaxPrecision(scale); - else if constexpr (std::is_same_v) - return createDecimalMaxPrecision(scale); - - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected branch in code of conversion function: it is a bug."); - } - else - { - // Optional second argument with time zone for DateTime. - UInt8 timezone_arg_position = 1; - UInt32 scale [[maybe_unused]] = DataTypeDateTime64::default_scale; - - // DateTime64 requires more arguments: scale and timezone. Since timezone is optional, scale should be first. 
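-            // E.g. toDateTime64(value, 3, 'UTC'): the scale (3) is arguments[1],
-            // so the optional timezone shifts to arguments[2].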
- if (isDateTime64(arguments)) - { - timezone_arg_position += 1; - scale = static_cast(arguments[1].column->get64(0)); - - if (to_datetime64 || scale != 0) /// toDateTime('xxxx-xx-xx xx:xx:xx', 0) return DateTime - return std::make_shared(scale, - extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0, false)); - - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0, false)); - } - - if constexpr (std::is_same_v) - return std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0, false)); - else if constexpr (std::is_same_v) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected branch in code of conversion function: it is a bug."); - else - return std::make_shared(); - } - } - - /// Function actually uses default implementation for nulls, - /// but we need to know if return type is Nullable or not, - /// so we use checked_return_type only to intercept the first call to getReturnTypeImpl(...). - bool useDefaultImplementationForNulls() const override - { - bool to_nullable_string = to_nullable && std::is_same_v; - return checked_return_type && !to_nullable_string; - } - - bool useDefaultImplementationForConstants() const override { return true; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override - { - if constexpr (std::is_same_v) - return {}; - else if constexpr (std::is_same_v) - return {2}; - return {1}; - } - bool canBeExecutedOnDefaultArguments() const override { return false; } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override - { - try - { - return executeInternal(arguments, result_type, input_rows_count); - } - catch (Exception & e) - { - /// More convenient error message. 
- if (e.code() == ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF) - { - e.addMessage("Cannot parse " - + result_type->getName() + " from " - + arguments[0].type->getName() - + ", because value is too short"); - } - else if (e.code() == ErrorCodes::CANNOT_PARSE_NUMBER - || e.code() == ErrorCodes::CANNOT_READ_ARRAY_FROM_TEXT - || e.code() == ErrorCodes::CANNOT_PARSE_INPUT_ASSERTION_FAILED - || e.code() == ErrorCodes::CANNOT_PARSE_QUOTED_STRING - || e.code() == ErrorCodes::CANNOT_PARSE_ESCAPE_SEQUENCE - || e.code() == ErrorCodes::CANNOT_PARSE_DATE - || e.code() == ErrorCodes::CANNOT_PARSE_DATETIME - || e.code() == ErrorCodes::CANNOT_PARSE_UUID - || e.code() == ErrorCodes::CANNOT_PARSE_IPV4 - || e.code() == ErrorCodes::CANNOT_PARSE_IPV6) - { - e.addMessage("Cannot parse " - + result_type->getName() + " from " - + arguments[0].type->getName()); - } - - throw; - } - } - - bool hasInformationAboutMonotonicity() const override - { - return Monotonic::has(); - } - - Monotonicity getMonotonicityForRange(const IDataType & type, const Field & left, const Field & right) const override - { - return Monotonic::get(type, left, right); - } - -private: - ContextPtr context; - mutable bool checked_return_type = false; - mutable bool to_nullable = false; - - ColumnPtr executeInternal(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const - { - if (arguments.empty()) - throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects at least 1 argument", getName()); - - if (result_type->onlyNull()) - return result_type->createColumnConstWithDefaultValue(input_rows_count); - - const DataTypePtr from_type = removeNullable(arguments[0].type); - ColumnPtr result_column; - - [[maybe_unused]] FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior = default_date_time_overflow_behavior; - - if (context) - date_time_overflow_behavior = context->getSettingsRef().date_time_overflow_behavior.value; - - auto call = [&](const auto & types, const auto & tag) -> bool - { - using Types = std::decay_t; - using LeftDataType = typename Types::LeftType; - using RightDataType = typename Types::RightType; - using SpecialTag = std::decay_t; - - if constexpr (IsDataTypeDecimal) - { - if constexpr (std::is_same_v) - { - /// Account for optional timezone argument. 
- if (arguments.size() != 2 && arguments.size() != 3) - throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects 2 or 3 arguments for DataTypeDateTime64.", getName()); - } - else if (arguments.size() != 2) - { - throw Exception(ErrorCodes::TOO_FEW_ARGUMENTS_FOR_FUNCTION, "Function {} expects 2 arguments for Decimal.", getName()); - } - - const ColumnWithTypeAndName & scale_column = arguments[1]; - UInt32 scale = extractToDecimalScale(scale_column); - - switch (date_time_overflow_behavior) - { - case FormatSettings::DateTimeOverflowBehavior::Throw: - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); - break; - case FormatSettings::DateTimeOverflowBehavior::Ignore: - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); - break; - case FormatSettings::DateTimeOverflowBehavior::Saturate: - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); - break; - } - - } - else if constexpr (IsDataTypeDateOrDateTime && std::is_same_v) - { - const auto * dt64 = assert_cast(arguments[0].type.get()); - switch (date_time_overflow_behavior) - { - case FormatSettings::DateTimeOverflowBehavior::Throw: - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, dt64->getScale()); - break; - case FormatSettings::DateTimeOverflowBehavior::Ignore: - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, dt64->getScale()); - break; - case FormatSettings::DateTimeOverflowBehavior::Saturate: - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, dt64->getScale()); - break; - } - } -#define GENERATE_OVERFLOW_MODE_CASE(OVERFLOW_MODE) \ - case FormatSettings::DateTimeOverflowBehavior::OVERFLOW_MODE: \ - result_column = ConvertImpl::execute( \ - arguments, result_type, input_rows_count); \ - break; - - else if constexpr (IsDataTypeDecimalOrNumber && IsDataTypeDecimalOrNumber) - { - using LeftT = typename LeftDataType::FieldType; - using RightT = typename RightDataType::FieldType; - - static constexpr bool bad_left = - is_decimal || std::is_floating_point_v || is_big_int_v || is_signed_v; - static constexpr bool bad_right = - is_decimal || std::is_floating_point_v || is_big_int_v || is_signed_v; - - /// Disallow int vs UUID conversion (but support int vs UInt128 conversion) - if constexpr ((bad_left && std::is_same_v) || - (bad_right && std::is_same_v)) - { - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Wrong UUID conversion"); - } - else - { - switch (date_time_overflow_behavior) - { - GENERATE_OVERFLOW_MODE_CASE(Throw) - GENERATE_OVERFLOW_MODE_CASE(Ignore) - GENERATE_OVERFLOW_MODE_CASE(Saturate) - } - } - } - else if constexpr ((IsDataTypeNumber || IsDataTypeDateOrDateTime) - && IsDataTypeDateOrDateTime) - { - switch (date_time_overflow_behavior) - { - GENERATE_OVERFLOW_MODE_CASE(Throw) - GENERATE_OVERFLOW_MODE_CASE(Ignore) - GENERATE_OVERFLOW_MODE_CASE(Saturate) - } - } -#undef GENERATE_OVERFLOW_MODE_CASE - else - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count); - - return true; - }; - - if (isDateTime64(arguments)) - { - /// For toDateTime('xxxx-xx-xx xx:xx:xx.00', 2[, 'timezone']) we need to it convert to DateTime64 - const ColumnWithTypeAndName & scale_column = arguments[1]; - UInt32 scale = extractToDecimalScale(scale_column); - - if (to_datetime64 || scale != 0) /// When scale = 0, the data type is DateTime otherwise the data type is DateTime64 - { - if 
(!callOnIndexAndDataType(from_type->getTypeId(), call, ConvertDefaultBehaviorTag{})) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", - arguments[0].type->getName(), getName()); - - return result_column; - } - } - - if constexpr (std::is_same_v) - { - if (from_type->getCustomSerialization()) - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); - } - - bool done = false; - if constexpr (to_string_or_fixed_string) - { - done = callOnIndexAndDataType(from_type->getTypeId(), call, ConvertDefaultBehaviorTag{}); - } - else - { - bool cast_ipv4_ipv6_default_on_conversion_error = false; - if constexpr (is_any_of) - if (context && (cast_ipv4_ipv6_default_on_conversion_error = context->getSettingsRef().cast_ipv4_ipv6_default_on_conversion_error)) - done = callOnIndexAndDataType(from_type->getTypeId(), call, ConvertReturnZeroOnErrorTag{}); - - if (!cast_ipv4_ipv6_default_on_conversion_error) - { - /// We should use ConvertFromStringExceptionMode::Null mode when converting from String (or FixedString) - /// to Nullable type, to avoid 'value is too short' error on attempt to parse empty string from NULL values. - if (to_nullable && WhichDataType(from_type).isStringOrFixedString()) - done = callOnIndexAndDataType(from_type->getTypeId(), call, ConvertReturnNullOnErrorTag{}); - else - done = callOnIndexAndDataType(from_type->getTypeId(), call, ConvertDefaultBehaviorTag{}); - } - } - - if (!done) - { - /// Generic conversion of any type to String. - if (std::is_same_v) - { - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); - } - else - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", - arguments[0].type->getName(), getName()); - } - - return result_column; - } -}; - - -/** Function toTOrZero (where T is number of date or datetime type): - * try to convert from String to type T through parsing, - * if cannot parse, return default value instead of throwing exception. - * Function toTOrNull will return Nullable type with NULL when cannot parse. - * NOTE Also need to implement tryToUnixTimestamp with timezone. 
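- *  E.g. toUInt32OrZero('abc') returns 0 and toUInt32OrNull('abc') returns NULL,
- *  where plain toUInt32('abc') would throw.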
- */ -template -class FunctionConvertFromString : public IFunction -{ -public: - static constexpr auto name = Name::name; - static constexpr bool to_decimal = - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v>; - - static constexpr bool to_datetime64 = std::is_same_v; - - static FunctionPtr create(ContextPtr) { return std::make_shared(); } - static FunctionPtr create() { return std::make_shared(); } - - String getName() const override - { - return name; - } - - bool isVariadic() const override { return true; } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - size_t getNumberOfArguments() const override { return 0; } - - bool useDefaultImplementationForConstants() const override { return true; } - bool canBeExecutedOnDefaultArguments() const override { return false; } - - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - - DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override - { - DataTypePtr res; - - if (isDateTime64(arguments)) - { - validateFunctionArgumentTypes(*this, arguments, - FunctionArgumentDescriptors{{"string", static_cast(&isStringOrFixedString), nullptr, "String or FixedString"}}, - // optional - FunctionArgumentDescriptors{ - {"precision", static_cast(&isUInt8), isColumnConst, "const UInt8"}, - {"timezone", static_cast(&isStringOrFixedString), isColumnConst, "const String or FixedString"}, - }); - - UInt64 scale = to_datetime64 ? DataTypeDateTime64::default_scale : 0; - if (arguments.size() > 1) - scale = extractToDecimalScale(arguments[1]); - const auto timezone = extractTimeZoneNameFromFunctionArguments(arguments, 2, 0, false); - - res = scale == 0 ? res = std::make_shared(timezone) : std::make_shared(scale, timezone); - } - else - { - if ((arguments.size() != 1 && arguments.size() != 2) || (to_decimal && arguments.size() != 2)) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be 1 or 2. " - "Second argument only make sense for DateTime (time zone, optional) and Decimal (scale).", - getName(), arguments.size()); - - if (!isStringOrFixedString(arguments[0].type)) - { - if (this->getName().find("OrZero") != std::string::npos || - this->getName().find("OrNull") != std::string::npos) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}. 
" - "Conversion functions with postfix 'OrZero' or 'OrNull' should take String argument", - arguments[0].type->getName(), getName()); - else - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of first argument of function {}", - arguments[0].type->getName(), getName()); - } - - if (arguments.size() == 2) - { - if constexpr (std::is_same_v) - { - if (!isString(arguments[1].type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of 2nd argument of function {}", - arguments[1].type->getName(), getName()); - } - else if constexpr (to_decimal) - { - if (!isInteger(arguments[1].type)) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of 2nd argument of function {}", - arguments[1].type->getName(), getName()); - if (!arguments[1].column) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be constant", getName()); - } - else - { - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, - "Number of arguments for function {} doesn't match: passed {}, should be 1. " - "Second argument makes sense only for DateTime and Decimal.", - getName(), arguments.size()); - } - } - - if constexpr (std::is_same_v) - res = std::make_shared(extractTimeZoneNameFromFunctionArguments(arguments, 1, 0, false)); - else if constexpr (std::is_same_v) - throw Exception(ErrorCodes::LOGICAL_ERROR, "MaterializedMySQL is a bug."); - else if constexpr (to_decimal) - { - UInt64 scale = extractToDecimalScale(arguments[1]); - res = createDecimalMaxPrecision(scale); - if (!res) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Something wrong with toDecimalNNOrZero() or toDecimalNNOrNull()"); - } - else - res = std::make_shared(); - } - - if constexpr (exception_mode == ConvertFromStringExceptionMode::Null) - res = std::make_shared(res); - - return res; - } - - template - ColumnPtr executeInternal(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count, UInt32 scale = 0) const - { - const IDataType * from_type = arguments[0].type.get(); - - if (checkAndGetDataType(from_type)) - { - return ConvertThroughParsing::execute( - arguments, result_type, input_rows_count, scale); - } - else if (checkAndGetDataType(from_type)) - { - return ConvertThroughParsing::execute( - arguments, result_type, input_rows_count, scale); - } - - return nullptr; - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override - { - ColumnPtr result_column; - - if constexpr (to_decimal) - result_column = executeInternal(arguments, result_type, input_rows_count, - assert_cast(*removeNullable(result_type)).getScale()); - else - { - if (isDateTime64(arguments)) - { - UInt64 scale = to_datetime64 ? DataTypeDateTime64::default_scale : 0; - if (arguments.size() > 1) - scale = extractToDecimalScale(arguments[1]); - - if (scale == 0) - result_column = executeInternal(arguments, result_type, input_rows_count); - else - { - result_column = executeInternal(arguments, result_type, input_rows_count, static_cast(scale)); - } - } - else - { - result_column = executeInternal(arguments, result_type, input_rows_count); - } - } - - if (!result_column) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}. " - "Only String or FixedString argument is accepted for try-conversion function. 
For other arguments, " - "use function without 'orZero' or 'orNull'.", arguments[0].type->getName(), getName()); - - return result_column; - } -}; - - -/// Monotonicity. - -struct PositiveMonotonicity -{ - static bool has() { return true; } - static IFunction::Monotonicity get(const IDataType &, const Field &, const Field &) - { - return { .is_monotonic = true }; - } -}; - -struct UnknownMonotonicity -{ - static bool has() { return false; } - static IFunction::Monotonicity get(const IDataType &, const Field &, const Field &) - { - return { }; - } -}; - -template -struct ToNumberMonotonicity -{ - static bool has() { return true; } - - static UInt64 divideByRangeOfType(UInt64 x) - { - if constexpr (sizeof(T) < sizeof(UInt64)) - return x >> (sizeof(T) * 8); - else - return 0; - } - - static IFunction::Monotonicity get(const IDataType & type, const Field & left, const Field & right) - { - if (!type.isValueRepresentedByNumber()) - return {}; - - /// If type is same, the conversion is always monotonic. - /// (Enum has separate case, because it is different data type) - if (checkAndGetDataType>(&type) || - checkAndGetDataType>(&type)) - return { .is_monotonic = true, .is_always_monotonic = true }; - - /// Float cases. - - /// When converting to Float, the conversion is always monotonic. - if constexpr (std::is_floating_point_v) - return { .is_monotonic = true, .is_always_monotonic = true }; - - const auto * low_cardinality = typeid_cast(&type); - const IDataType * low_cardinality_dictionary_type = nullptr; - if (low_cardinality) - low_cardinality_dictionary_type = low_cardinality->getDictionaryType().get(); - - WhichDataType which_type(type); - WhichDataType which_inner_type = low_cardinality - ? WhichDataType(low_cardinality_dictionary_type) - : WhichDataType(type); - - /// If converting from Float, for monotonicity, arguments must fit in range of result type. - if (which_inner_type.isFloat()) - { - if (left.isNull() || right.isNull()) - return {}; - - Float64 left_float = left.get(); - Float64 right_float = right.get(); - - if (left_float >= static_cast(std::numeric_limits::min()) - && left_float <= static_cast(std::numeric_limits::max()) - && right_float >= static_cast(std::numeric_limits::min()) - && right_float <= static_cast(std::numeric_limits::max())) - return { .is_monotonic = true }; - - return {}; - } - - /// Integer cases. - - /// Only support types represented by native integers. - /// It can be extended to big integers, decimals and DateTime64 later. - /// By the way, NULLs are representing unbounded ranges. - if (!((left.isNull() || left.getType() == Field::Types::UInt64 || left.getType() == Field::Types::Int64) - && (right.isNull() || right.getType() == Field::Types::UInt64 || right.getType() == Field::Types::Int64))) - return {}; - - const bool from_is_unsigned = type.isValueRepresentedByUnsignedInteger(); - const bool to_is_unsigned = is_unsigned_v; - - const size_t size_of_from = type.getSizeOfValueInMemory(); - const size_t size_of_to = sizeof(T); - - const bool left_in_first_half = left.isNull() - ? from_is_unsigned - : (left.get() >= 0); - - const bool right_in_first_half = right.isNull() - ? !from_is_unsigned - : (right.get() >= 0); - - /// Size of type is the same. - if (size_of_from == size_of_to) - { - if (from_is_unsigned == to_is_unsigned) - return { .is_monotonic = true, .is_always_monotonic = true }; - - if (left_in_first_half == right_in_first_half) - return { .is_monotonic = true }; - - return {}; - } - - /// Size of type is expanded. 
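    /// For illustration: widening between types of the same signedness (Int8 -> Int64,
    /// UInt8 -> UInt64) and widening into a signed type (UInt8 -> Int16) preserve the order
    /// of every representable value, so the conversion is always monotonic. Widening
    /// signed -> unsigned wraps negative values to the top of the range; a minimal sketch
    /// of the rule checked below, using standard fixed-width types:
    ///     static_assert(static_cast<std::uint16_t>(std::int8_t{-1}) == 65535);
    ///     static_assert(static_cast<std::uint16_t>(std::int8_t{1}) == 1);
    /// so such a conversion is monotonic only while the argument range stays within one sign half.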
- if (size_of_from < size_of_to) - { - if (from_is_unsigned == to_is_unsigned) - return { .is_monotonic = true, .is_always_monotonic = true }; - - if (!to_is_unsigned) - return { .is_monotonic = true, .is_always_monotonic = true }; - - /// signed -> unsigned. If arguments from the same half, then function is monotonic. - if (left_in_first_half == right_in_first_half) - return { .is_monotonic = true }; - - return {}; - } - - /// Size of type is shrunk. - if (size_of_from > size_of_to) - { - /// Function cannot be monotonic on unbounded ranges. - if (left.isNull() || right.isNull()) - return {}; - - /// Function cannot be monotonic when left and right are not on the same ranges. - if (divideByRangeOfType(left.get()) != divideByRangeOfType(right.get())) - return {}; - - if (to_is_unsigned) - return { .is_monotonic = true }; - else - { - // If To is signed, it's possible that the signedness is different after conversion. So we check it explicitly. - const bool is_monotonic = (T(left.get()) >= 0) == (T(right.get()) >= 0); - - return { .is_monotonic = is_monotonic }; - } - } - - UNREACHABLE(); - } -}; - -struct ToDateMonotonicity -{ - static bool has() { return true; } - - static IFunction::Monotonicity get(const IDataType & type, const Field & left, const Field & right) - { - auto which = WhichDataType(type); - if (which.isDateOrDate32() || which.isDateTime() || which.isDateTime64() || which.isInt8() || which.isInt16() || which.isUInt8() - || which.isUInt16()) - { - return {.is_monotonic = true, .is_always_monotonic = true}; - } - else if ( - ((left.getType() == Field::Types::UInt64 || left.isNull()) && (right.getType() == Field::Types::UInt64 || right.isNull()) - && ((left.isNull() || left.get() < 0xFFFF) && (right.isNull() || right.get() >= 0xFFFF))) - || ((left.getType() == Field::Types::Int64 || left.isNull()) && (right.getType() == Field::Types::Int64 || right.isNull()) - && ((left.isNull() || left.get() < 0xFFFF) && (right.isNull() || right.get() >= 0xFFFF))) - || (( - (left.getType() == Field::Types::Float64 || left.isNull()) - && (right.getType() == Field::Types::Float64 || right.isNull()) - && ((left.isNull() || left.get() < 0xFFFF) && (right.isNull() || right.get() >= 0xFFFF)))) - || !isNativeNumber(type)) - { - return {}; - } - else - { - return {.is_monotonic = true, .is_always_monotonic = true}; - } - } -}; - -struct ToDateTimeMonotonicity -{ - static bool has() { return true; } - - static IFunction::Monotonicity get(const IDataType & type, const Field &, const Field &) - { - if (type.isValueRepresentedByNumber()) - return {.is_monotonic = true, .is_always_monotonic = true}; - else - return {}; - } -}; - -/** The monotonicity for the `toString` function is mainly determined for test purposes. - * It is doubtful that anyone is looking to optimize queries with conditions `toString(CounterID) = 34`. - */ -struct ToStringMonotonicity -{ - static bool has() { return true; } - - static IFunction::Monotonicity get(const IDataType & type, const Field & left, const Field & right) - { - IFunction::Monotonicity positive{ .is_monotonic = true }; - IFunction::Monotonicity not_monotonic; - - const auto * type_ptr = &type; - if (const auto * low_cardinality_type = checkAndGetDataType(type_ptr)) - type_ptr = low_cardinality_type->getDictionaryType().get(); - - /// Order on enum values (which is the order on integers) is completely arbitrary in respect to the order on strings. 
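    /// For illustration: for Enum8('z' = 1, 'a' = 2) the integer order is 1 < 2, but the
    /// corresponding strings compare as 'a' < 'z', so toString over an Enum cannot be
    /// declared monotonic.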
- if (WhichDataType(type).isEnum()) - return not_monotonic; - - /// `toString` function is monotonous if the argument is Date or Date32 or DateTime or String, or non-negative numbers with the same number of symbols. - if (checkDataTypes(type_ptr)) - return positive; - - if (left.isNull() || right.isNull()) - return {}; - - if (left.getType() == Field::Types::UInt64 - && right.getType() == Field::Types::UInt64) - { - return (left.get() == 0 && right.get() == 0) - || (floor(log10(left.get())) == floor(log10(right.get()))) - ? positive : not_monotonic; - } - - if (left.getType() == Field::Types::Int64 - && right.getType() == Field::Types::Int64) - { - return (left.get() == 0 && right.get() == 0) - || (left.get() > 0 && right.get() > 0 && floor(log10(left.get())) == floor(log10(right.get()))) - ? positive : not_monotonic; - } - - return not_monotonic; - } -}; - - -struct NameToUInt8 { static constexpr auto name = "toUInt8"; }; -struct NameToUInt16 { static constexpr auto name = "toUInt16"; }; -struct NameToUInt32 { static constexpr auto name = "toUInt32"; }; -struct NameToUInt64 { static constexpr auto name = "toUInt64"; }; -struct NameToUInt128 { static constexpr auto name = "toUInt128"; }; -struct NameToUInt256 { static constexpr auto name = "toUInt256"; }; -struct NameToInt8 { static constexpr auto name = "toInt8"; }; -struct NameToInt16 { static constexpr auto name = "toInt16"; }; -struct NameToInt32 { static constexpr auto name = "toInt32"; }; -struct NameToInt64 { static constexpr auto name = "toInt64"; }; -struct NameToInt128 { static constexpr auto name = "toInt128"; }; -struct NameToInt256 { static constexpr auto name = "toInt256"; }; -struct NameToFloat32 { static constexpr auto name = "toFloat32"; }; -struct NameToFloat64 { static constexpr auto name = "toFloat64"; }; -struct NameToUUID { static constexpr auto name = "toUUID"; }; -struct NameToIPv4 { static constexpr auto name = "toIPv4"; }; -struct NameToIPv6 { static constexpr auto name = "toIPv6"; }; - -using FunctionToUInt8 = FunctionConvert>; -using FunctionToUInt16 = FunctionConvert>; -using FunctionToUInt32 = FunctionConvert>; -using FunctionToUInt64 = FunctionConvert>; -using FunctionToUInt128 = FunctionConvert>; -using FunctionToUInt256 = FunctionConvert>; -using FunctionToInt8 = FunctionConvert>; -using FunctionToInt16 = FunctionConvert>; -using FunctionToInt32 = FunctionConvert>; -using FunctionToInt64 = FunctionConvert>; -using FunctionToInt128 = FunctionConvert>; -using FunctionToInt256 = FunctionConvert>; -using FunctionToFloat32 = FunctionConvert>; -using FunctionToFloat64 = FunctionConvert>; - -using FunctionToDate = FunctionConvert; - -using FunctionToDate32 = FunctionConvert; - -using FunctionToDateTime = FunctionConvert; - -using FunctionToDateTime32 = FunctionConvert; - -using FunctionToDateTime64 = FunctionConvert; - -using FunctionToUUID = FunctionConvert>; -using FunctionToIPv4 = FunctionConvert>; -using FunctionToIPv6 = FunctionConvert>; -using FunctionToString = FunctionConvert; -using FunctionToUnixTimestamp = FunctionConvert>; -using FunctionToDecimal32 = FunctionConvert, NameToDecimal32, UnknownMonotonicity>; -using FunctionToDecimal64 = FunctionConvert, NameToDecimal64, UnknownMonotonicity>; -using FunctionToDecimal128 = FunctionConvert, NameToDecimal128, UnknownMonotonicity>; -using FunctionToDecimal256 = FunctionConvert, NameToDecimal256, UnknownMonotonicity>; - -template struct FunctionTo; - -template <> struct FunctionTo { using Type = FunctionToUInt8; }; -template <> struct FunctionTo { using Type = 
FunctionToUInt16; }; -template <> struct FunctionTo { using Type = FunctionToUInt32; }; -template <> struct FunctionTo { using Type = FunctionToUInt64; }; -template <> struct FunctionTo { using Type = FunctionToUInt128; }; -template <> struct FunctionTo { using Type = FunctionToUInt256; }; -template <> struct FunctionTo { using Type = FunctionToInt8; }; -template <> struct FunctionTo { using Type = FunctionToInt16; }; -template <> struct FunctionTo { using Type = FunctionToInt32; }; -template <> struct FunctionTo { using Type = FunctionToInt64; }; -template <> struct FunctionTo { using Type = FunctionToInt128; }; -template <> struct FunctionTo { using Type = FunctionToInt256; }; -template <> struct FunctionTo { using Type = FunctionToFloat32; }; -template <> struct FunctionTo { using Type = FunctionToFloat64; }; - -template -struct FunctionTo { using Type = FunctionToDate; }; - -template -struct FunctionTo { using Type = FunctionToDate32; }; - -template -struct FunctionTo { using Type = FunctionToDateTime; }; - -template -struct FunctionTo { using Type = FunctionToDateTime64; }; - -template <> struct FunctionTo { using Type = FunctionToUUID; }; -template <> struct FunctionTo { using Type = FunctionToIPv4; }; -template <> struct FunctionTo { using Type = FunctionToIPv6; }; -template <> struct FunctionTo { using Type = FunctionToString; }; -template <> struct FunctionTo { using Type = FunctionToFixedString; }; -template <> struct FunctionTo> { using Type = FunctionToDecimal32; }; -template <> struct FunctionTo> { using Type = FunctionToDecimal64; }; -template <> struct FunctionTo> { using Type = FunctionToDecimal128; }; -template <> struct FunctionTo> { using Type = FunctionToDecimal256; }; - -template struct FunctionTo> - : FunctionTo> -{ -}; - -struct NameToUInt8OrZero { static constexpr auto name = "toUInt8OrZero"; }; -struct NameToUInt16OrZero { static constexpr auto name = "toUInt16OrZero"; }; -struct NameToUInt32OrZero { static constexpr auto name = "toUInt32OrZero"; }; -struct NameToUInt64OrZero { static constexpr auto name = "toUInt64OrZero"; }; -struct NameToUInt128OrZero { static constexpr auto name = "toUInt128OrZero"; }; -struct NameToUInt256OrZero { static constexpr auto name = "toUInt256OrZero"; }; -struct NameToInt8OrZero { static constexpr auto name = "toInt8OrZero"; }; -struct NameToInt16OrZero { static constexpr auto name = "toInt16OrZero"; }; -struct NameToInt32OrZero { static constexpr auto name = "toInt32OrZero"; }; -struct NameToInt64OrZero { static constexpr auto name = "toInt64OrZero"; }; -struct NameToInt128OrZero { static constexpr auto name = "toInt128OrZero"; }; -struct NameToInt256OrZero { static constexpr auto name = "toInt256OrZero"; }; -struct NameToFloat32OrZero { static constexpr auto name = "toFloat32OrZero"; }; -struct NameToFloat64OrZero { static constexpr auto name = "toFloat64OrZero"; }; -struct NameToDateOrZero { static constexpr auto name = "toDateOrZero"; }; -struct NameToDate32OrZero { static constexpr auto name = "toDate32OrZero"; }; -struct NameToDateTimeOrZero { static constexpr auto name = "toDateTimeOrZero"; }; -struct NameToDateTime64OrZero { static constexpr auto name = "toDateTime64OrZero"; }; -struct NameToDecimal32OrZero { static constexpr auto name = "toDecimal32OrZero"; }; -struct NameToDecimal64OrZero { static constexpr auto name = "toDecimal64OrZero"; }; -struct NameToDecimal128OrZero { static constexpr auto name = "toDecimal128OrZero"; }; -struct NameToDecimal256OrZero { static constexpr auto name = "toDecimal256OrZero"; }; -struct 
NameToUUIDOrZero { static constexpr auto name = "toUUIDOrZero"; }; -struct NameToIPv4OrZero { static constexpr auto name = "toIPv4OrZero"; }; -struct NameToIPv6OrZero { static constexpr auto name = "toIPv6OrZero"; }; - -using FunctionToUInt8OrZero = FunctionConvertFromString; -using FunctionToUInt16OrZero = FunctionConvertFromString; -using FunctionToUInt32OrZero = FunctionConvertFromString; -using FunctionToUInt64OrZero = FunctionConvertFromString; -using FunctionToUInt128OrZero = FunctionConvertFromString; -using FunctionToUInt256OrZero = FunctionConvertFromString; -using FunctionToInt8OrZero = FunctionConvertFromString; -using FunctionToInt16OrZero = FunctionConvertFromString; -using FunctionToInt32OrZero = FunctionConvertFromString; -using FunctionToInt64OrZero = FunctionConvertFromString; -using FunctionToInt128OrZero = FunctionConvertFromString; -using FunctionToInt256OrZero = FunctionConvertFromString; -using FunctionToFloat32OrZero = FunctionConvertFromString; -using FunctionToFloat64OrZero = FunctionConvertFromString; -using FunctionToDateOrZero = FunctionConvertFromString; -using FunctionToDate32OrZero = FunctionConvertFromString; -using FunctionToDateTimeOrZero = FunctionConvertFromString; -using FunctionToDateTime64OrZero = FunctionConvertFromString; -using FunctionToDecimal32OrZero = FunctionConvertFromString, NameToDecimal32OrZero, ConvertFromStringExceptionMode::Zero>; -using FunctionToDecimal64OrZero = FunctionConvertFromString, NameToDecimal64OrZero, ConvertFromStringExceptionMode::Zero>; -using FunctionToDecimal128OrZero = FunctionConvertFromString, NameToDecimal128OrZero, ConvertFromStringExceptionMode::Zero>; -using FunctionToDecimal256OrZero = FunctionConvertFromString, NameToDecimal256OrZero, ConvertFromStringExceptionMode::Zero>; -using FunctionToUUIDOrZero = FunctionConvertFromString; -using FunctionToIPv4OrZero = FunctionConvertFromString; -using FunctionToIPv6OrZero = FunctionConvertFromString; - -struct NameToUInt8OrNull { static constexpr auto name = "toUInt8OrNull"; }; -struct NameToUInt16OrNull { static constexpr auto name = "toUInt16OrNull"; }; -struct NameToUInt32OrNull { static constexpr auto name = "toUInt32OrNull"; }; -struct NameToUInt64OrNull { static constexpr auto name = "toUInt64OrNull"; }; -struct NameToUInt128OrNull { static constexpr auto name = "toUInt128OrNull"; }; -struct NameToUInt256OrNull { static constexpr auto name = "toUInt256OrNull"; }; -struct NameToInt8OrNull { static constexpr auto name = "toInt8OrNull"; }; -struct NameToInt16OrNull { static constexpr auto name = "toInt16OrNull"; }; -struct NameToInt32OrNull { static constexpr auto name = "toInt32OrNull"; }; -struct NameToInt64OrNull { static constexpr auto name = "toInt64OrNull"; }; -struct NameToInt128OrNull { static constexpr auto name = "toInt128OrNull"; }; -struct NameToInt256OrNull { static constexpr auto name = "toInt256OrNull"; }; -struct NameToFloat32OrNull { static constexpr auto name = "toFloat32OrNull"; }; -struct NameToFloat64OrNull { static constexpr auto name = "toFloat64OrNull"; }; -struct NameToDateOrNull { static constexpr auto name = "toDateOrNull"; }; -struct NameToDate32OrNull { static constexpr auto name = "toDate32OrNull"; }; -struct NameToDateTimeOrNull { static constexpr auto name = "toDateTimeOrNull"; }; -struct NameToDateTime64OrNull { static constexpr auto name = "toDateTime64OrNull"; }; -struct NameToDecimal32OrNull { static constexpr auto name = "toDecimal32OrNull"; }; -struct NameToDecimal64OrNull { static constexpr auto name = "toDecimal64OrNull"; }; 
-struct NameToDecimal128OrNull { static constexpr auto name = "toDecimal128OrNull"; }; -struct NameToDecimal256OrNull { static constexpr auto name = "toDecimal256OrNull"; }; -struct NameToUUIDOrNull { static constexpr auto name = "toUUIDOrNull"; }; -struct NameToIPv4OrNull { static constexpr auto name = "toIPv4OrNull"; }; -struct NameToIPv6OrNull { static constexpr auto name = "toIPv6OrNull"; }; - -using FunctionToUInt8OrNull = FunctionConvertFromString; -using FunctionToUInt16OrNull = FunctionConvertFromString; -using FunctionToUInt32OrNull = FunctionConvertFromString; -using FunctionToUInt64OrNull = FunctionConvertFromString; -using FunctionToUInt128OrNull = FunctionConvertFromString; -using FunctionToUInt256OrNull = FunctionConvertFromString; -using FunctionToInt8OrNull = FunctionConvertFromString; -using FunctionToInt16OrNull = FunctionConvertFromString; -using FunctionToInt32OrNull = FunctionConvertFromString; -using FunctionToInt64OrNull = FunctionConvertFromString; -using FunctionToInt128OrNull = FunctionConvertFromString; -using FunctionToInt256OrNull = FunctionConvertFromString; -using FunctionToFloat32OrNull = FunctionConvertFromString; -using FunctionToFloat64OrNull = FunctionConvertFromString; -using FunctionToDateOrNull = FunctionConvertFromString; -using FunctionToDate32OrNull = FunctionConvertFromString; -using FunctionToDateTimeOrNull = FunctionConvertFromString; -using FunctionToDateTime64OrNull = FunctionConvertFromString; -using FunctionToDecimal32OrNull = FunctionConvertFromString, NameToDecimal32OrNull, ConvertFromStringExceptionMode::Null>; -using FunctionToDecimal64OrNull = FunctionConvertFromString, NameToDecimal64OrNull, ConvertFromStringExceptionMode::Null>; -using FunctionToDecimal128OrNull = FunctionConvertFromString, NameToDecimal128OrNull, ConvertFromStringExceptionMode::Null>; -using FunctionToDecimal256OrNull = FunctionConvertFromString, NameToDecimal256OrNull, ConvertFromStringExceptionMode::Null>; -using FunctionToUUIDOrNull = FunctionConvertFromString; -using FunctionToIPv4OrNull = FunctionConvertFromString; -using FunctionToIPv6OrNull = FunctionConvertFromString; - -struct NameParseDateTimeBestEffort { static constexpr auto name = "parseDateTimeBestEffort"; }; -struct NameParseDateTimeBestEffortOrZero { static constexpr auto name = "parseDateTimeBestEffortOrZero"; }; -struct NameParseDateTimeBestEffortOrNull { static constexpr auto name = "parseDateTimeBestEffortOrNull"; }; -struct NameParseDateTimeBestEffortUS { static constexpr auto name = "parseDateTimeBestEffortUS"; }; -struct NameParseDateTimeBestEffortUSOrZero { static constexpr auto name = "parseDateTimeBestEffortUSOrZero"; }; -struct NameParseDateTimeBestEffortUSOrNull { static constexpr auto name = "parseDateTimeBestEffortUSOrNull"; }; -struct NameParseDateTime32BestEffort { static constexpr auto name = "parseDateTime32BestEffort"; }; -struct NameParseDateTime32BestEffortOrZero { static constexpr auto name = "parseDateTime32BestEffortOrZero"; }; -struct NameParseDateTime32BestEffortOrNull { static constexpr auto name = "parseDateTime32BestEffortOrNull"; }; -struct NameParseDateTime64BestEffort { static constexpr auto name = "parseDateTime64BestEffort"; }; -struct NameParseDateTime64BestEffortOrZero { static constexpr auto name = "parseDateTime64BestEffortOrZero"; }; -struct NameParseDateTime64BestEffortOrNull { static constexpr auto name = "parseDateTime64BestEffortOrNull"; }; -struct NameParseDateTime64BestEffortUS { static constexpr auto name = "parseDateTime64BestEffortUS"; }; -struct 
NameParseDateTime64BestEffortUSOrZero { static constexpr auto name = "parseDateTime64BestEffortUSOrZero"; }; -struct NameParseDateTime64BestEffortUSOrNull { static constexpr auto name = "parseDateTime64BestEffortUSOrNull"; }; - - -using FunctionParseDateTimeBestEffort = FunctionConvertFromString< - DataTypeDateTime, NameParseDateTimeBestEffort, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffort>; -using FunctionParseDateTimeBestEffortOrZero = FunctionConvertFromString< - DataTypeDateTime, NameParseDateTimeBestEffortOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffort>; -using FunctionParseDateTimeBestEffortOrNull = FunctionConvertFromString< - DataTypeDateTime, NameParseDateTimeBestEffortOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffort>; - -using FunctionParseDateTimeBestEffortUS = FunctionConvertFromString< - DataTypeDateTime, NameParseDateTimeBestEffortUS, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffortUS>; -using FunctionParseDateTimeBestEffortUSOrZero = FunctionConvertFromString< - DataTypeDateTime, NameParseDateTimeBestEffortUSOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffortUS>; -using FunctionParseDateTimeBestEffortUSOrNull = FunctionConvertFromString< - DataTypeDateTime, NameParseDateTimeBestEffortUSOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffortUS>; - -using FunctionParseDateTime32BestEffort = FunctionConvertFromString< - DataTypeDateTime, NameParseDateTime32BestEffort, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffort>; -using FunctionParseDateTime32BestEffortOrZero = FunctionConvertFromString< - DataTypeDateTime, NameParseDateTime32BestEffortOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffort>; -using FunctionParseDateTime32BestEffortOrNull = FunctionConvertFromString< - DataTypeDateTime, NameParseDateTime32BestEffortOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffort>; - -using FunctionParseDateTime64BestEffort = FunctionConvertFromString< - DataTypeDateTime64, NameParseDateTime64BestEffort, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffort>; -using FunctionParseDateTime64BestEffortOrZero = FunctionConvertFromString< - DataTypeDateTime64, NameParseDateTime64BestEffortOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffort>; -using FunctionParseDateTime64BestEffortOrNull = FunctionConvertFromString< - DataTypeDateTime64, NameParseDateTime64BestEffortOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffort>; - -using FunctionParseDateTime64BestEffortUS = FunctionConvertFromString< - DataTypeDateTime64, NameParseDateTime64BestEffortUS, ConvertFromStringExceptionMode::Throw, ConvertFromStringParsingMode::BestEffortUS>; -using FunctionParseDateTime64BestEffortUSOrZero = FunctionConvertFromString< - DataTypeDateTime64, NameParseDateTime64BestEffortUSOrZero, ConvertFromStringExceptionMode::Zero, ConvertFromStringParsingMode::BestEffortUS>; -using FunctionParseDateTime64BestEffortUSOrNull = FunctionConvertFromString< - DataTypeDateTime64, NameParseDateTime64BestEffortUSOrNull, ConvertFromStringExceptionMode::Null, ConvertFromStringParsingMode::BestEffortUS>; - - -class ExecutableFunctionCast : public IExecutableFunction -{ -public: - using WrapperType = std::function; - - explicit 
ExecutableFunctionCast( - WrapperType && wrapper_function_, const char * name_, std::optional diagnostic_) - : wrapper_function(std::move(wrapper_function_)), name(name_), diagnostic(std::move(diagnostic_)) {} - - String getName() const override { return name; } - -protected: - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override - { - /// drop second argument, pass others - ColumnsWithTypeAndName new_arguments{arguments.front()}; - if (arguments.size() > 2) - new_arguments.insert(std::end(new_arguments), std::next(std::begin(arguments), 2), std::end(arguments)); - - try - { - return wrapper_function(new_arguments, result_type, nullptr, input_rows_count); - } - catch (Exception & e) - { - if (diagnostic) - e.addMessage("while converting source column " + backQuoteIfNeed(diagnostic->column_from) + - " to destination column " + backQuoteIfNeed(diagnostic->column_to)); - throw; - } - } - - bool useDefaultImplementationForNulls() const override { return false; } - /// CAST(Nothing, T) -> T - bool useDefaultImplementationForNothing() const override { return false; } - bool useDefaultImplementationForConstants() const override { return true; } - bool useDefaultImplementationForLowCardinalityColumns() const override { return false; } - ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - -private: - WrapperType wrapper_function; - const char * name; - std::optional diagnostic; -}; - - -struct FunctionCastName -{ - static constexpr auto name = "CAST"; -}; - -class FunctionCastBase : public IFunctionBase -{ -public: - using MonotonicityForRange = std::function; -}; - -class FunctionCast final : public FunctionCastBase -{ -public: - using WrapperType = std::function; - - FunctionCast(ContextPtr context_ - , const char * cast_name_ - , MonotonicityForRange && monotonicity_for_range_ - , const DataTypes & argument_types_ - , const DataTypePtr & return_type_ - , std::optional diagnostic_ - , CastType cast_type_) - : cast_name(cast_name_), monotonicity_for_range(std::move(monotonicity_for_range_)) - , argument_types(argument_types_), return_type(return_type_), diagnostic(std::move(diagnostic_)) - , cast_type(cast_type_) - , context(context_) - { - } - - const DataTypes & getArgumentTypes() const override { return argument_types; } - const DataTypePtr & getResultType() const override { return return_type; } - - ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName & /*sample_columns*/) const override - { - try - { - return std::make_unique( - prepareUnpackDictionaries(getArgumentTypes()[0], getResultType()), cast_name, diagnostic); - } - catch (Exception & e) - { - if (diagnostic) - e.addMessage("while converting source column " + backQuoteIfNeed(diagnostic->column_from) + - " to destination column " + backQuoteIfNeed(diagnostic->column_to)); - throw; - } - } - - String getName() const override { return cast_name; } - - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - - bool hasInformationAboutMonotonicity() const override - { - return static_cast(monotonicity_for_range); - } - - Monotonicity getMonotonicityForRange(const IDataType & type, const Field & left, const Field & right) const override - { - return monotonicity_for_range(type, left, right); - } - -private: - - const char * cast_name; - MonotonicityForRange monotonicity_for_range; - - DataTypes argument_types; - DataTypePtr return_type; - - 
std::optional diagnostic; - CastType cast_type; - ContextPtr context; - - static WrapperType createFunctionAdaptor(FunctionPtr function, const DataTypePtr & from_type) - { - auto function_adaptor = std::make_unique(function)->build({ColumnWithTypeAndName{nullptr, from_type, ""}}); - - return [function_adaptor] - (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) - { - return function_adaptor->execute(arguments, result_type, input_rows_count); - }; - } - - static WrapperType createToNullableColumnWrapper() - { - return [] (ColumnsWithTypeAndName &, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) - { - ColumnPtr res = result_type->createColumn(); - ColumnUInt8::Ptr col_null_map_to = ColumnUInt8::create(input_rows_count, true); - return ColumnNullable::create(res->cloneResized(input_rows_count), std::move(col_null_map_to)); - }; - } - - template - WrapperType createWrapper(const DataTypePtr & from_type, const ToDataType * const to_type, bool requested_result_is_nullable) const - { - TypeIndex from_type_index = from_type->getTypeId(); - WhichDataType which(from_type_index); - bool can_apply_accurate_cast = (cast_type == CastType::accurate || cast_type == CastType::accurateOrNull) - && (which.isInt() || which.isUInt() || which.isFloat()); - - FormatSettings::DateTimeOverflowBehavior date_time_overflow_behavior = default_date_time_overflow_behavior; - if (context) - date_time_overflow_behavior = context->getSettingsRef().date_time_overflow_behavior; - - if (requested_result_is_nullable && checkAndGetDataType(from_type.get())) - { - /// In case when converting to Nullable type, we apply different parsing rule, - /// that will not throw an exception but return NULL in case of malformed input. 
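    /// For illustration: CAST('abc' AS Nullable(UInt32)) takes this branch and yields NULL,
    /// matching toUInt32OrNull('abc'), whereas CAST('abc' AS UInt32) throws a parsing error.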
- FunctionPtr function = FunctionConvertFromString::create(); - return createFunctionAdaptor(function, from_type); - } - else if (!can_apply_accurate_cast) - { - FunctionPtr function = FunctionTo::Type::create(context); - return createFunctionAdaptor(function, from_type); - } - - auto wrapper_cast_type = cast_type; - - return [wrapper_cast_type, from_type_index, to_type, date_time_overflow_behavior] - (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *column_nullable, size_t input_rows_count) - { - ColumnPtr result_column; - auto res = callOnIndexAndDataType(from_type_index, [&](const auto & types) -> bool { - using Types = std::decay_t; - using LeftDataType = typename Types::LeftType; - using RightDataType = typename Types::RightType; - - if constexpr (IsDataTypeNumber) - { - if constexpr (IsDataTypeNumber) - { -#define GENERATE_OVERFLOW_MODE_CASE(OVERFLOW_MODE, ADDITIONS) \ - case FormatSettings::DateTimeOverflowBehavior::OVERFLOW_MODE: \ - result_column = ConvertImpl::execute( \ - arguments, result_type, input_rows_count, ADDITIONS()); \ - break; - if (wrapper_cast_type == CastType::accurate) - { - switch (date_time_overflow_behavior) - { - GENERATE_OVERFLOW_MODE_CASE(Throw, AccurateConvertStrategyAdditions) - GENERATE_OVERFLOW_MODE_CASE(Ignore, AccurateConvertStrategyAdditions) - GENERATE_OVERFLOW_MODE_CASE(Saturate, AccurateConvertStrategyAdditions) - } - } - else - { - switch (date_time_overflow_behavior) - { - GENERATE_OVERFLOW_MODE_CASE(Throw, AccurateOrNullConvertStrategyAdditions) - GENERATE_OVERFLOW_MODE_CASE(Ignore, AccurateOrNullConvertStrategyAdditions) - GENERATE_OVERFLOW_MODE_CASE(Saturate, AccurateOrNullConvertStrategyAdditions) - } - } -#undef GENERATE_OVERFLOW_MODE_CASE - - return true; - } - - if constexpr (std::is_same_v || std::is_same_v) - { -#define GENERATE_OVERFLOW_MODE_CASE(OVERFLOW_MODE, ADDITIONS) \ - case FormatSettings::DateTimeOverflowBehavior::OVERFLOW_MODE: \ - result_column = ConvertImpl::template execute( \ -arguments, result_type, input_rows_count); \ - break; - if (wrapper_cast_type == CastType::accurate) - { - switch (date_time_overflow_behavior) - { - GENERATE_OVERFLOW_MODE_CASE(Throw, DateTimeAccurateConvertStrategyAdditions) - GENERATE_OVERFLOW_MODE_CASE(Ignore, DateTimeAccurateConvertStrategyAdditions) - GENERATE_OVERFLOW_MODE_CASE(Saturate, DateTimeAccurateConvertStrategyAdditions) - } - } - else - { - switch (date_time_overflow_behavior) - { - GENERATE_OVERFLOW_MODE_CASE(Throw, DateTimeAccurateOrNullConvertStrategyAdditions) - GENERATE_OVERFLOW_MODE_CASE(Ignore, DateTimeAccurateOrNullConvertStrategyAdditions) - GENERATE_OVERFLOW_MODE_CASE(Saturate, DateTimeAccurateOrNullConvertStrategyAdditions) - } - } -#undef GENERATE_OVERFLOW_MODE_CASE - return true; - } - } - - return false; - }); - - /// Additionally check if callOnIndexAndDataType wasn't called at all. 
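    /// For illustration: res stays false when the source type index matches none of the
    /// native numeric and date/time cases handled above (e.g. a composite source type
    /// reaching this numeric wrapper); accurateOrNull then degrades to an all-NULL column
    /// of the result type, while the other cast kinds report the conversion as unsupported.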
- if (!res) - { - if (wrapper_cast_type == CastType::accurateOrNull) - { - auto nullable_column_wrapper = FunctionCast::createToNullableColumnWrapper(); - return nullable_column_wrapper(arguments, result_type, column_nullable, input_rows_count); - } - else - { - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, - "Conversion from {} to {} is not supported", - from_type_index, to_type->getName()); - } - } - - return result_column; - }; - } - - template - WrapperType createBoolWrapper(const DataTypePtr & from_type, const ToDataType * const to_type, bool requested_result_is_nullable) const - { - if (checkAndGetDataType(from_type.get())) - { - return &ConvertImplGenericFromString::execute; - } - - return createWrapper(from_type, to_type, requested_result_is_nullable); - } - - WrapperType createUInt8ToBoolWrapper(const DataTypePtr from_type, const DataTypePtr to_type) const - { - return [from_type, to_type] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t /*input_rows_count*/) -> ColumnPtr - { - /// Special case when we convert UInt8 column to Bool column. - /// both columns have type UInt8, but we shouldn't use identity wrapper, - /// because Bool column can contain only 0 and 1. - auto res_column = to_type->createColumn(); - const auto & data_from = checkAndGetColumn(arguments[0].column.get())->getData(); - auto & data_to = assert_cast(res_column.get())->getData(); - data_to.resize(data_from.size()); - for (size_t i = 0; i != data_from.size(); ++i) - data_to[i] = static_cast(data_from[i]); - return res_column; - }; - } - - static WrapperType createStringWrapper(const DataTypePtr & from_type) - { - FunctionPtr function = FunctionToString::create(); - return createFunctionAdaptor(function, from_type); - } - - WrapperType createFixedStringWrapper(const DataTypePtr & from_type, const size_t N) const - { - if (!isStringOrFixedString(from_type)) - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "CAST AS FixedString is only implemented for types String and FixedString"); - - bool exception_mode_null = cast_type == CastType::accurateOrNull; - return [exception_mode_null, N] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t /*input_rows_count*/) - { - if (exception_mode_null) - return FunctionToFixedString::executeForN(arguments, N); - else - return FunctionToFixedString::executeForN(arguments, N); - }; - } - -#define GENERATE_INTERVAL_CASE(INTERVAL_KIND) \ - case IntervalKind::Kind::INTERVAL_KIND: \ - return createFunctionAdaptor(FunctionConvert::create(), from_type); - - static WrapperType createIntervalWrapper(const DataTypePtr & from_type, IntervalKind kind) - { - switch (kind) - { - GENERATE_INTERVAL_CASE(Nanosecond) - GENERATE_INTERVAL_CASE(Microsecond) - GENERATE_INTERVAL_CASE(Millisecond) - GENERATE_INTERVAL_CASE(Second) - GENERATE_INTERVAL_CASE(Minute) - GENERATE_INTERVAL_CASE(Hour) - GENERATE_INTERVAL_CASE(Day) - GENERATE_INTERVAL_CASE(Week) - GENERATE_INTERVAL_CASE(Month) - GENERATE_INTERVAL_CASE(Quarter) - GENERATE_INTERVAL_CASE(Year) - } - throw Exception{ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion to unexpected IntervalKind: {}", kind.toString()}; - } - -#undef GENERATE_INTERVAL_CASE - - template - requires IsDataTypeDecimal - WrapperType createDecimalWrapper(const DataTypePtr & from_type, const ToDataType * to_type, bool requested_result_is_nullable) const - { - TypeIndex type_index = from_type->getTypeId(); - UInt32 scale = to_type->getScale(); - - WhichDataType which(type_index); - bool ok = 
which.isNativeInt() || which.isNativeUInt() || which.isDecimal() || which.isFloat() || which.isDateOrDate32() || which.isDateTime() || which.isDateTime64() - || which.isStringOrFixedString(); - if (!ok) - { - if (cast_type == CastType::accurateOrNull) - return createToNullableColumnWrapper(); - else - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion from {} to {} is not supported", - from_type->getName(), to_type->getName()); - } - - auto wrapper_cast_type = cast_type; - - return [wrapper_cast_type, type_index, scale, to_type, requested_result_is_nullable] - (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *column_nullable, size_t input_rows_count) - { - ColumnPtr result_column; - auto res = callOnIndexAndDataType(type_index, [&](const auto & types) -> bool - { - using Types = std::decay_t; - using LeftDataType = typename Types::LeftType; - using RightDataType = typename Types::RightType; - - if constexpr (IsDataTypeDecimalOrNumber && IsDataTypeDecimalOrNumber && !std::is_same_v) - { - if (wrapper_cast_type == CastType::accurate) - { - AccurateConvertStrategyAdditions additions; - additions.scale = scale; - result_column = ConvertImpl::execute( - arguments, result_type, input_rows_count, additions); - - return true; - } - else if (wrapper_cast_type == CastType::accurateOrNull) - { - AccurateOrNullConvertStrategyAdditions additions; - additions.scale = scale; - result_column = ConvertImpl::execute( - arguments, result_type, input_rows_count, additions); - - return true; - } - } - else if constexpr (std::is_same_v) - { - if (requested_result_is_nullable) - { - /// Consistent with CAST(Nullable(String) AS Nullable(Numbers)) - /// In case when converting to Nullable type, we apply different parsing rule, - /// that will not throw an exception but return NULL in case of malformed input. - result_column = ConvertImpl::execute( - arguments, result_type, input_rows_count, scale); - - return true; - } - } - - result_column = ConvertImpl::execute(arguments, result_type, input_rows_count, scale); - - return true; - }); - - /// Additionally check if callOnIndexAndDataType wasn't called at all. - if (!res) - { - if (wrapper_cast_type == CastType::accurateOrNull) - { - auto nullable_column_wrapper = FunctionCast::createToNullableColumnWrapper(); - return nullable_column_wrapper(arguments, result_type, column_nullable, input_rows_count); - } - else - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, - "Conversion from {} to {} is not supported", - type_index, to_type->getName()); - } - - return result_column; - }; - } - - WrapperType createAggregateFunctionWrapper(const DataTypePtr & from_type_untyped, const DataTypeAggregateFunction * to_type) const - { - /// Conversion from String through parsing. 
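    /// For illustration, a sketch of the two paths below: a String source is parsed as the
    /// textual representation of an aggregate state, while an AggregateFunction source whose
    /// state representation coincides with the target's is rebound in place: the state data
    /// is reused as-is and only the attached function object is replaced, so no per-row
    /// conversion happens.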
- if (checkAndGetDataType(from_type_untyped.get())) - { - return &ConvertImplGenericFromString::execute; - } - else if (const auto * agg_type = checkAndGetDataType(from_type_untyped.get())) - { - if (agg_type->getFunction()->haveSameStateRepresentation(*to_type->getFunction())) - { - return [function = to_type->getFunction()]( - ColumnsWithTypeAndName & arguments, - const DataTypePtr & /* result_type */, - const ColumnNullable * /* nullable_source */, - size_t /*input_rows_count*/) -> ColumnPtr - { - const auto & argument_column = arguments.front(); - const auto * col_agg = checkAndGetColumn(argument_column.column.get()); - if (col_agg) - { - auto new_col_agg = ColumnAggregateFunction::create(*col_agg); - new_col_agg->set(function); - return new_col_agg; - } - else - { - throw Exception( - ErrorCodes::LOGICAL_ERROR, - "Illegal column {} for function CAST AS AggregateFunction", - argument_column.column->getName()); - } - }; - } - } - - if (cast_type == CastType::accurateOrNull) - return createToNullableColumnWrapper(); - else - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion from {} to {} is not supported", - from_type_untyped->getName(), to_type->getName()); - } - - WrapperType createArrayWrapper(const DataTypePtr & from_type_untyped, const DataTypeArray & to_type) const - { - /// Conversion from String through parsing. - if (checkAndGetDataType(from_type_untyped.get())) - { - return &ConvertImplGenericFromString::execute; - } - - DataTypePtr from_type_holder; - const auto * from_type = checkAndGetDataType(from_type_untyped.get()); - const auto * from_type_map = checkAndGetDataType(from_type_untyped.get()); - - /// Convert from Map - if (from_type_map) - { - /// Recreate array of unnamed tuples because otherwise it may work - /// unexpectedly while converting to array of named tuples. 
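    /// For illustration: the nested type of Map(String, UInt64) is an Array of a named
    /// two-element Tuple (the elements carry the map's key/value names); if the target is
    /// an Array of differently named Tuples, converting the named nested tuple directly
    /// would match elements by name and fill the misses with defaults, so the names are
    /// stripped first to force positional conversion.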
- from_type_holder = from_type_map->getNestedTypeWithUnnamedTuple(); - from_type = assert_cast(from_type_holder.get()); - } - - if (!from_type) - { - throw Exception(ErrorCodes::TYPE_MISMATCH, - "CAST AS Array can only be performed between same-dimensional Array, Map or String types"); - } - - DataTypePtr from_nested_type = from_type->getNestedType(); - - /// In query SELECT CAST([] AS Array(Array(String))) from type is Array(Nothing) - bool from_empty_array = isNothing(from_nested_type); - - if (from_type->getNumberOfDimensions() != to_type.getNumberOfDimensions() && !from_empty_array) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "CAST AS Array can only be performed between same-dimensional array types"); - - const DataTypePtr & to_nested_type = to_type.getNestedType(); - - /// Prepare nested type conversion - const auto nested_function = prepareUnpackDictionaries(from_nested_type, to_nested_type); - - return [nested_function, from_nested_type, to_nested_type]( - ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t /*input_rows_count*/) -> ColumnPtr - { - const auto & argument_column = arguments.front(); - - const ColumnArray * col_array = nullptr; - - if (const ColumnMap * col_map = checkAndGetColumn(argument_column.column.get())) - col_array = &col_map->getNestedColumn(); - else - col_array = checkAndGetColumn(argument_column.column.get()); - - if (col_array) - { - /// create columns for converting nested column containing original and result columns - ColumnsWithTypeAndName nested_columns{{ col_array->getDataPtr(), from_nested_type, "" }}; - - /// convert nested column - auto result_column = nested_function(nested_columns, to_nested_type, nullable_source, nested_columns.front().column->size()); - - /// set converted nested column to result - return ColumnArray::create(result_column, col_array->getOffsetsPtr()); - } - else - { - throw Exception(ErrorCodes::LOGICAL_ERROR, - "Illegal column {} for function CAST AS Array", - argument_column.column->getName()); - } - }; - } - - using ElementWrappers = std::vector; - - ElementWrappers getElementWrappers(const DataTypes & from_element_types, const DataTypes & to_element_types) const - { - ElementWrappers element_wrappers; - element_wrappers.reserve(from_element_types.size()); - - /// Create conversion wrapper for each element in tuple - for (size_t i = 0; i < from_element_types.size(); ++i) - { - const DataTypePtr & from_element_type = from_element_types[i]; - const DataTypePtr & to_element_type = to_element_types[i]; - element_wrappers.push_back(prepareUnpackDictionaries(from_element_type, to_element_type)); - } - - return element_wrappers; - } - - WrapperType createTupleWrapper(const DataTypePtr & from_type_untyped, const DataTypeTuple * to_type) const - { - /// Conversion from String through parsing. - if (checkAndGetDataType(from_type_untyped.get())) - { - return &ConvertImplGenericFromString::execute; - } - - const auto * from_type = checkAndGetDataType(from_type_untyped.get()); - if (!from_type) - throw Exception(ErrorCodes::TYPE_MISMATCH, "CAST AS Tuple can only be performed between tuple types or from String.\n" - "Left type: {}, right type: {}", from_type_untyped->getName(), to_type->getName()); - - const auto & from_element_types = from_type->getElements(); - const auto & to_element_types = to_type->getElements(); - - std::vector element_wrappers; - std::vector> to_reverse_index; - - /// For named tuples allow conversions for tuples with - /// different sets of elements. 
If element exists in @to_type
-        /// and doesn't exist in @from_type it will be filled by default values.
-        if (from_type->haveExplicitNames() && to_type->haveExplicitNames())
-        {
-            const auto & from_names = from_type->getElementNames();
-            std::unordered_map<String, size_t> from_positions;
-            from_positions.reserve(from_names.size());
-            for (size_t i = 0; i < from_names.size(); ++i)
-                from_positions[from_names[i]] = i;
-
-            const auto & to_names = to_type->getElementNames();
-            element_wrappers.reserve(to_names.size());
-            to_reverse_index.reserve(from_names.size());
-
-            for (size_t i = 0; i < to_names.size(); ++i)
-            {
-                auto it = from_positions.find(to_names[i]);
-                if (it != from_positions.end())
-                {
-                    element_wrappers.emplace_back(prepareUnpackDictionaries(from_element_types[it->second], to_element_types[i]));
-                    to_reverse_index.emplace_back(it->second);
-                }
-                else
-                {
-                    element_wrappers.emplace_back();
-                    to_reverse_index.emplace_back();
-                }
-            }
-        }
-        else
-        {
-            if (from_element_types.size() != to_element_types.size())
-                throw Exception(ErrorCodes::TYPE_MISMATCH, "CAST AS Tuple can only be performed between tuple types "
-                    "with the same number of elements or from String.\nLeft type: {}, right type: {}",
-                    from_type->getName(), to_type->getName());
-
-            element_wrappers = getElementWrappers(from_element_types, to_element_types);
-            to_reverse_index.reserve(to_element_types.size());
-            for (size_t i = 0; i < to_element_types.size(); ++i)
-                to_reverse_index.emplace_back(i);
-        }
-
-        return [element_wrappers, from_element_types, to_element_types, to_reverse_index]
-            (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) -> ColumnPtr
-        {
-            const auto * col = arguments.front().column.get();
-
-            size_t tuple_size = to_element_types.size();
-            const ColumnTuple & column_tuple = typeid_cast<const ColumnTuple &>(*col);
-
-            Columns converted_columns(tuple_size);
-
-            /// invoke conversion for each element
-            for (size_t i = 0; i < tuple_size; ++i)
-            {
-                if (to_reverse_index[i])
-                {
-                    size_t from_idx = *to_reverse_index[i];
-                    ColumnsWithTypeAndName element = {{column_tuple.getColumns()[from_idx], from_element_types[from_idx], "" }};
-                    converted_columns[i] = element_wrappers[i](element, to_element_types[i], nullable_source, input_rows_count);
-                }
-                else
-                {
-                    converted_columns[i] = to_element_types[i]->createColumn()->cloneResized(input_rows_count);
-                }
-            }
-
-            return ColumnTuple::create(converted_columns);
-        };
-    }
-
-    /// The case of: tuple([key1, key2, ..., key_n], [value1, value2, ..., value_n])
-    WrapperType createTupleToMapWrapper(const DataTypes & from_kv_types, const DataTypes & to_kv_types) const
-    {
-        return [element_wrappers = getElementWrappers(from_kv_types, to_kv_types), from_kv_types, to_kv_types]
-            (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t /*input_rows_count*/) -> ColumnPtr
-        {
-            const auto * col = arguments.front().column.get();
-            const auto & column_tuple = assert_cast<const ColumnTuple &>(*col);
-
-            Columns offsets(2);
-            Columns converted_columns(2);
-            for (size_t i = 0; i < 2; ++i)
-            {
-                const auto & column_array = assert_cast<const ColumnArray &>(column_tuple.getColumn(i));
-                ColumnsWithTypeAndName element = {{column_array.getDataPtr(), from_kv_types[i], ""}};
-                converted_columns[i] = element_wrappers[i](element, to_kv_types[i], nullable_source, (element[0].column)->size());
-                offsets[i] = column_array.getOffsetsPtr();
-            }
-
-            const auto & keys_offsets = assert_cast<const ColumnArray::ColumnOffsets &>(*offsets[0]).getData();
-            const auto & values_offsets =
assert_cast(*offsets[1]).getData(); - if (keys_offsets != values_offsets) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "CAST AS Map can only be performed from tuple of arrays with equal sizes."); - - return ColumnMap::create(converted_columns[0], converted_columns[1], offsets[0]); - }; - } - - WrapperType createMapToMapWrapper(const DataTypes & from_kv_types, const DataTypes & to_kv_types) const - { - return [element_wrappers = getElementWrappers(from_kv_types, to_kv_types), from_kv_types, to_kv_types] - (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t /*input_rows_count*/) -> ColumnPtr - { - const auto * col = arguments.front().column.get(); - const auto & column_map = typeid_cast(*col); - const auto & nested_data = column_map.getNestedData(); - - Columns converted_columns(2); - for (size_t i = 0; i < 2; ++i) - { - ColumnsWithTypeAndName element = {{nested_data.getColumnPtr(i), from_kv_types[i], ""}}; - converted_columns[i] = element_wrappers[i](element, to_kv_types[i], nullable_source, (element[0].column)->size()); - } - - return ColumnMap::create(converted_columns[0], converted_columns[1], column_map.getNestedColumn().getOffsetsPtr()); - }; - } - - /// The case of: [(key1, value1), (key2, value2), ...] - WrapperType createArrayToMapWrapper(const DataTypes & from_kv_types, const DataTypes & to_kv_types) const - { - return [element_wrappers = getElementWrappers(from_kv_types, to_kv_types), from_kv_types, to_kv_types] - (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t /*input_rows_count*/) -> ColumnPtr - { - const auto * col = arguments.front().column.get(); - const auto & column_array = typeid_cast(*col); - const auto & nested_data = typeid_cast(column_array.getData()); - - Columns converted_columns(2); - for (size_t i = 0; i < 2; ++i) - { - ColumnsWithTypeAndName element = {{nested_data.getColumnPtr(i), from_kv_types[i], ""}}; - converted_columns[i] = element_wrappers[i](element, to_kv_types[i], nullable_source, (element[0].column)->size()); - } - - return ColumnMap::create(converted_columns[0], converted_columns[1], column_array.getOffsetsPtr()); - }; - } - - - WrapperType createMapWrapper(const DataTypePtr & from_type_untyped, const DataTypeMap * to_type) const - { - if (const auto * from_tuple = checkAndGetDataType(from_type_untyped.get())) - { - if (from_tuple->getElements().size() != 2) - throw Exception( - ErrorCodes::TYPE_MISMATCH, - "CAST AS Map from tuple requires 2 elements. " - "Left type: {}, right type: {}", - from_tuple->getName(), - to_type->getName()); - - DataTypes from_kv_types; - const auto & to_kv_types = to_type->getKeyValueTypes(); - - for (const auto & elem : from_tuple->getElements()) - { - const auto * type_array = checkAndGetDataType(elem.get()); - if (!type_array) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "CAST AS Map can only be performed from tuples of array. Got: {}", from_tuple->getName()); - - from_kv_types.push_back(type_array->getNestedType()); - } - - return createTupleToMapWrapper(from_kv_types, to_kv_types); - } - else if (const auto * from_array = typeid_cast(from_type_untyped.get())) - { - const auto * nested_tuple = typeid_cast(from_array->getNestedType().get()); - if (!nested_tuple || nested_tuple->getElements().size() != 2) - throw Exception( - ErrorCodes::TYPE_MISMATCH, - "CAST AS Map from array requires nested tuple of 2 elements. 
" - "Left type: {}, right type: {}", - from_array->getName(), - to_type->getName()); - - return createArrayToMapWrapper(nested_tuple->getElements(), to_type->getKeyValueTypes()); - } - else if (const auto * from_type = checkAndGetDataType(from_type_untyped.get())) - { - return createMapToMapWrapper(from_type->getKeyValueTypes(), to_type->getKeyValueTypes()); - } - else - { - throw Exception(ErrorCodes::TYPE_MISMATCH, "Unsupported types to CAST AS Map. " - "Left type: {}, right type: {}", from_type_untyped->getName(), to_type->getName()); - } - } - - WrapperType createTupleToObjectWrapper(const DataTypeTuple & from_tuple, bool has_nullable_subcolumns) const - { - if (!from_tuple.haveExplicitNames()) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten Named Tuple. Got: {}", from_tuple.getName()); - - PathsInData paths; - DataTypes from_types; - - std::tie(paths, from_types) = flattenTuple(from_tuple.getPtr()); - auto to_types = from_types; - - for (auto & type : to_types) - { - if (isTuple(type) || isNested(type)) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten Named Tuple. Got: {}", - from_tuple.getName()); - - type = recursiveRemoveLowCardinality(type); - } - - return [element_wrappers = getElementWrappers(from_types, to_types), - has_nullable_subcolumns, from_types, to_types, paths] - (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t input_rows_count) - { - size_t tuple_size = to_types.size(); - auto flattened_column = flattenTuple(arguments.front().column); - const auto & column_tuple = assert_cast(*flattened_column); - - if (tuple_size != column_tuple.getColumns().size()) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Expected tuple with {} subcolumn, but got {} subcolumns", - tuple_size, column_tuple.getColumns().size()); - - auto res = ColumnObject::create(has_nullable_subcolumns); - for (size_t i = 0; i < tuple_size; ++i) - { - ColumnsWithTypeAndName element = {{column_tuple.getColumns()[i], from_types[i], "" }}; - auto converted_column = element_wrappers[i](element, to_types[i], nullable_source, input_rows_count); - res->addSubcolumn(paths[i], converted_column->assumeMutable()); - } - - return res; - }; - } - - WrapperType createMapToObjectWrapper(const DataTypeMap & from_map, bool has_nullable_subcolumns) const - { - auto key_value_types = from_map.getKeyValueTypes(); - - if (!isStringOrFixedString(key_value_types[0])) - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object from Map can be performed only from Map " - "with String or FixedString key. 
Got: {}", from_map.getName()); - - const auto & value_type = key_value_types[1]; - auto to_value_type = value_type; - - if (!has_nullable_subcolumns && value_type->isNullable()) - to_value_type = removeNullable(value_type); - - if (has_nullable_subcolumns && !value_type->isNullable()) - to_value_type = makeNullable(value_type); - - DataTypes to_key_value_types{std::make_shared(), std::move(to_value_type)}; - auto element_wrappers = getElementWrappers(key_value_types, to_key_value_types); - - return [has_nullable_subcolumns, element_wrappers, key_value_types, to_key_value_types] - (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable * nullable_source, size_t) -> ColumnPtr - { - const auto & column_map = assert_cast(*arguments.front().column); - const auto & offsets = column_map.getNestedColumn().getOffsets(); - auto key_value_columns = column_map.getNestedData().getColumnsCopy(); - - for (size_t i = 0; i < 2; ++i) - { - ColumnsWithTypeAndName element{{key_value_columns[i], key_value_types[i], ""}}; - key_value_columns[i] = element_wrappers[i](element, to_key_value_types[i], nullable_source, key_value_columns[i]->size()); - } - - const auto & key_column_str = assert_cast(*key_value_columns[0]); - const auto & value_column = *key_value_columns[1]; - - using SubcolumnsMap = HashMap; - SubcolumnsMap subcolumns; - - for (size_t row = 0; row < offsets.size(); ++row) - { - for (size_t i = offsets[static_cast(row) - 1]; i < offsets[row]; ++i) - { - auto ref = key_column_str.getDataAt(i); - - bool inserted; - SubcolumnsMap::LookupResult it; - subcolumns.emplace(ref, it, inserted); - auto & subcolumn = it->getMapped(); - - if (inserted) - subcolumn = value_column.cloneEmpty()->cloneResized(row); - - /// Map can have duplicated keys. We insert only first one. - if (subcolumn->size() == row) - subcolumn->insertFrom(value_column, i); - } - - /// Insert default values for keys missed in current row. 
- for (const auto & [_, subcolumn] : subcolumns) - if (subcolumn->size() == row) - subcolumn->insertDefault(); - } - - auto column_object = ColumnObject::create(has_nullable_subcolumns); - for (auto && [key, subcolumn] : subcolumns) - { - PathInData path(key.toView()); - column_object->addSubcolumn(path, std::move(subcolumn)); - } - - return column_object; - }; - } - - WrapperType createObjectWrapper(const DataTypePtr & from_type, const DataTypeObject * to_type) const - { - if (const auto * from_tuple = checkAndGetDataType(from_type.get())) - { - return createTupleToObjectWrapper(*from_tuple, to_type->hasNullableSubcolumns()); - } - else if (const auto * from_map = checkAndGetDataType(from_type.get())) - { - return createMapToObjectWrapper(*from_map, to_type->hasNullableSubcolumns()); - } - else if (checkAndGetDataType(from_type.get())) - { - return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) - { - auto res = ConvertImplGenericFromString::execute(arguments, result_type, nullable_source, input_rows_count)->assumeMutable(); - res->finalize(); - return res; - }; - } - else if (checkAndGetDataType(from_type.get())) - { - return [is_nullable = to_type->hasNullableSubcolumns()] (ColumnsWithTypeAndName & arguments, const DataTypePtr & , const ColumnNullable * , size_t) -> ColumnPtr - { - auto & column_object = assert_cast(*arguments.front().column); - auto res = ColumnObject::create(is_nullable); - for (size_t i = 0; i < column_object.size(); i++) - res->insert(column_object[i]); - - res->finalize(); - return res; - }; - } - - throw Exception(ErrorCodes::TYPE_MISMATCH, - "Cast to Object can be performed only from flatten named Tuple, Map or String. Got: {}", from_type->getName()); - } - - WrapperType createVariantToVariantWrapper(const DataTypeVariant & from_variant, const DataTypeVariant & to_variant) const - { - /// We support only extension of variant type, so, only new types can be added. - /// For example: Variant(T1, T2) -> Variant(T1, T2, T3) is supported, but Variant(T1, T2) -> Variant(T1, T3) is not supported. - /// We want to extend Variant type for free without rewriting the data, but we sort data types inside Variant during type creation - /// (we do it because we want Variant(T1, T2) to be the same as Variant(T2, T1)), but after extension the order of variant types - /// (and so their discriminators) can be different. For example: Variant(T1, T3) -> Variant(T1, T2, T3). - /// To avoid full rewrite of discriminators column, ColumnVariant supports it's local order of variant columns (and so local - /// discriminators) and stores mapping global order -> local order. - /// So, to extend Variant with new types for free, we should keep old local order for old variants, append new variants and change - /// mapping global order -> local order according to the new global order. - - /// Create map (new variant type) -> (it's global discriminator in new order). - const auto & new_variants = to_variant.getVariants(); - std::unordered_map new_variant_types_to_new_global_discriminator; - new_variant_types_to_new_global_discriminator.reserve(new_variants.size()); - for (size_t i = 0; i != new_variants.size(); ++i) - new_variant_types_to_new_global_discriminator[new_variants[i]->getName()] = i; - - /// Create set of old variant types. 
- const auto & old_variants = from_variant.getVariants(); - std::unordered_map old_variant_types_to_old_global_discriminator; - old_variant_types_to_old_global_discriminator.reserve(old_variants.size()); - for (size_t i = 0; i != old_variants.size(); ++i) - old_variant_types_to_old_global_discriminator[old_variants[i]->getName()] = i; - - /// Check that the set of old variants types is a subset of new variant types and collect new global discriminator for each old global discriminator. - std::unordered_map old_global_discriminator_to_new; - old_global_discriminator_to_new.reserve(old_variants.size()); - for (const auto & [old_variant_type, old_discriminator] : old_variant_types_to_old_global_discriminator) - { - auto it = new_variant_types_to_new_global_discriminator.find(old_variant_type); - if (it == new_variant_types_to_new_global_discriminator.end()) - throw Exception( - ErrorCodes::CANNOT_CONVERT_TYPE, - "Cannot convert type {} to {}. Conversion between Variant types is allowed only when new Variant type is an extension " - "of an initial one", from_variant.getName(), to_variant.getName()); - old_global_discriminator_to_new[old_discriminator] = it->second; - } - - /// Collect variant types and their global discriminators that should be added to the old Variant to get the new Variant. - std::vector> variant_types_and_discriminators_to_add; - variant_types_and_discriminators_to_add.reserve(new_variants.size() - old_variants.size()); - for (size_t i = 0; i != new_variants.size(); ++i) - { - if (!old_variant_types_to_old_global_discriminator.contains(new_variants[i]->getName())) - variant_types_and_discriminators_to_add.emplace_back(new_variants[i], i); - } - - return [old_global_discriminator_to_new, variant_types_and_discriminators_to_add] - (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t) -> ColumnPtr - { - const auto & column_variant = assert_cast(*arguments.front().column.get()); - size_t num_old_variants = column_variant.getNumVariants(); - Columns new_variant_columns; - new_variant_columns.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); - std::vector new_local_to_global_discriminators; - new_local_to_global_discriminators.reserve(num_old_variants + variant_types_and_discriminators_to_add.size()); - for (size_t i = 0; i != num_old_variants; ++i) - { - new_variant_columns.push_back(column_variant.getVariantPtrByLocalDiscriminator(i)); - new_local_to_global_discriminators.push_back(old_global_discriminator_to_new.at(column_variant.globalDiscriminatorByLocal(i))); - } - - for (const auto & [new_variant_type, new_global_discriminator] : variant_types_and_discriminators_to_add) - { - new_variant_columns.push_back(new_variant_type->createColumn()); - new_local_to_global_discriminators.push_back(new_global_discriminator); - } - - return ColumnVariant::create(column_variant.getLocalDiscriminatorsPtr(), column_variant.getOffsetsPtr(), new_variant_columns, new_local_to_global_discriminators); - }; - } - - WrapperType createVariantToColumnWrapper(const DataTypeVariant & from_variant, const DataTypePtr & to_type) const - { - const auto & variant_types = from_variant.getVariants(); - std::vector variant_wrappers; - variant_wrappers.reserve(variant_types.size()); - - /// Create conversion wrapper for each variant. 
- for (const auto & variant_type : variant_types) - variant_wrappers.push_back(prepareUnpackDictionaries(variant_type, to_type)); - - return [variant_wrappers, variant_types, to_type] - (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr - { - const auto & column_variant = assert_cast(*arguments.front().column.get()); - - /// First, cast each variant to the result type. - std::vector casted_variant_columns; - casted_variant_columns.reserve(variant_types.size()); - for (size_t i = 0; i != variant_types.size(); ++i) - { - auto variant_col = column_variant.getVariantPtrByLocalDiscriminator(i); - ColumnsWithTypeAndName variant = {{variant_col, variant_types[i], "" }}; - const auto & variant_wrapper = variant_wrappers[column_variant.globalDiscriminatorByLocal(i)]; - casted_variant_columns.push_back(variant_wrapper(variant, result_type, nullptr, variant_col->size())); - } - - /// Second, construct resulting column from casted variant columns according to discriminators. - const auto & local_discriminators = column_variant.getLocalDiscriminators(); - auto res = result_type->createColumn(); - res->reserve(input_rows_count); - for (size_t i = 0; i != input_rows_count; ++i) - { - auto local_discr = local_discriminators[i]; - if (local_discr == ColumnVariant::NULL_DISCRIMINATOR) - res->insertDefault(); - else - res->insertFrom(*casted_variant_columns[local_discr], column_variant.offsetAt(i)); - } - - return res; - }; - } - - static ColumnPtr createVariantFromDescriptorsAndOneNonEmptyVariant(const DataTypes & variant_types, const ColumnPtr & discriminators, const ColumnPtr & variant, ColumnVariant::Discriminator variant_discr) - { - Columns variants; - variants.reserve(variant_types.size()); - for (size_t i = 0; i != variant_types.size(); ++i) - { - if (i == variant_discr) - variants.emplace_back(variant); - else - variants.push_back(variant_types[i]->createColumn()); - } - - return ColumnVariant::create(discriminators, variants); - } - - WrapperType createColumnToVariantWrapper(const DataTypePtr & from_type, const DataTypeVariant & to_variant) const - { - /// We allow converting NULL to Variant(...) as Variant can store NULLs. - if (from_type->onlyNull()) - { - return [](ColumnsWithTypeAndName &, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr - { - auto result_column = result_type->createColumn(); - result_column->insertManyDefaults(input_rows_count); - return result_column; - }; - } - - auto variant_discr_opt = to_variant.tryGetVariantDiscriminator(*removeNullableOrLowCardinalityNullable(from_type)); - if (!variant_discr_opt) - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert type {} to {}. 
Conversion to Variant allowed only for types from this Variant", from_type->getName(), to_variant.getName()); - - return [variant_discr = *variant_discr_opt] - (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t) -> ColumnPtr - { - const auto & result_variant_type = assert_cast(*result_type); - const auto & variant_types = result_variant_type.getVariants(); - if (const ColumnNullable * col_nullable = typeid_cast(arguments.front().column.get())) - { - const auto & column = col_nullable->getNestedColumnPtr(); - const auto & null_map = col_nullable->getNullMapData(); - IColumn::Filter filter; - filter.reserve(column->size()); - auto discriminators = ColumnVariant::ColumnDiscriminators::create(); - auto & discriminators_data = discriminators->getData(); - discriminators_data.reserve(column->size()); - size_t variant_size_hint = 0; - for (size_t i = 0; i != column->size(); ++i) - { - if (null_map[i]) - { - discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); - filter.push_back(0); - } - else - { - discriminators_data.push_back(variant_discr); - filter.push_back(1); - ++variant_size_hint; - } - } - - ColumnPtr variant_column; - /// If there were no NULLs, just use the column. - if (variant_size_hint == column->size()) - variant_column = column; - /// Otherwise we should use filtered column. - else - variant_column = column->filter(filter, variant_size_hint); - return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), variant_column, variant_discr); - } - else if (isColumnLowCardinalityNullable(*arguments.front().column)) - { - const auto & column = arguments.front().column; - - /// Variant column cannot have LowCardinality(Nullable(...)) variant, as Variant column stores NULLs itself. - /// We should create a null-map, insert NULL_DISCRIMINATOR on NULL values and filter initial column. - const auto & col_lc = assert_cast(*column); - const auto & indexes = col_lc.getIndexes(); - auto null_index = col_lc.getDictionary().getNullValueIndex(); - IColumn::Filter filter; - filter.reserve(col_lc.size()); - auto discriminators = ColumnVariant::ColumnDiscriminators::create(); - auto & discriminators_data = discriminators->getData(); - discriminators_data.reserve(col_lc.size()); - size_t variant_size_hint = 0; - for (size_t i = 0; i != col_lc.size(); ++i) - { - if (indexes.getUInt(i) == null_index) - { - discriminators_data.push_back(ColumnVariant::NULL_DISCRIMINATOR); - filter.push_back(0); - } - else - { - discriminators_data.push_back(variant_discr); - filter.push_back(1); - ++variant_size_hint; - } - } - - MutableColumnPtr variant_column; - /// If there were no NULLs, we can just clone the column. - if (variant_size_hint == col_lc.size()) - variant_column = IColumn::mutate(column); - /// Otherwise we should filter column. 
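A standalone sketch of the Nullable-to-Variant path above: NULL rows get NULL_DISCRIMINATOR and are filtered out, so the variant column stores only the non-NULL values densely (255 again stands in for NULL_DISCRIMINATOR; the data is illustrative):

#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

int main()
{
    const uint8_t NULL_DISCRIMINATOR = 255;
    const uint8_t variant_discr = 0; /// discriminator of the matching variant

    std::vector<std::optional<int>> source = {1, std::nullopt, 2, std::nullopt, 3};

    std::vector<uint8_t> discriminators;
    std::vector<int> variant_column; /// dense: holds the non-NULL rows only
    for (const auto & value : source)
    {
        if (value)
        {
            discriminators.push_back(variant_discr);
            variant_column.push_back(*value);
        }
        else
            discriminators.push_back(NULL_DISCRIMINATOR);
    }

    std::cout << "discriminators:";
    for (int d : discriminators)
        std::cout << ' ' << d;
    std::cout << "\nvariant column:";
    for (int v : variant_column)
        std::cout << ' ' << v;
    std::cout << '\n';
}

The filter step corresponds to column->filter(filter, variant_size_hint) in the wrapper; when no NULLs are present the source column is reused as is.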
- else - variant_column = column->filter(filter, variant_size_hint)->assumeMutable(); - - assert_cast(*variant_column).nestedRemoveNullable(); - return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), std::move(variant_column), variant_discr); - } - else - { - const auto & column = arguments.front().column; - auto discriminators = ColumnVariant::ColumnDiscriminators::create(); - discriminators->getData().resize_fill(column->size(), variant_discr); - return createVariantFromDescriptorsAndOneNonEmptyVariant(variant_types, std::move(discriminators), column, variant_discr); - } - }; - } - - /// Wrapper for conversion to/from Variant type - WrapperType createVariantWrapper(const DataTypePtr & from_type, const DataTypePtr & to_type) const - { - if (const auto * from_variant = checkAndGetDataType(from_type.get())) - { - if (const auto * to_variant = checkAndGetDataType(to_type.get())) - return createVariantToVariantWrapper(*from_variant, *to_variant); - - return createVariantToColumnWrapper(*from_variant, to_type); - } - - return createColumnToVariantWrapper(from_type, assert_cast(*to_type)); - } - - template - WrapperType createEnumWrapper(const DataTypePtr & from_type, const DataTypeEnum * to_type) const - { - using EnumType = DataTypeEnum; - using Function = typename FunctionTo::Type; - - if (const auto * from_enum8 = checkAndGetDataType(from_type.get())) - checkEnumToEnumConversion(from_enum8, to_type); - else if (const auto * from_enum16 = checkAndGetDataType(from_type.get())) - checkEnumToEnumConversion(from_enum16, to_type); - - if (checkAndGetDataType(from_type.get())) - return createStringToEnumWrapper(); - else if (checkAndGetDataType(from_type.get())) - return createStringToEnumWrapper(); - else if (isNativeNumber(from_type) || isEnum(from_type)) - { - auto function = Function::create(); - return createFunctionAdaptor(function, from_type); - } - else - { - if (cast_type == CastType::accurateOrNull) - return createToNullableColumnWrapper(); - else - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion from {} to {} is not supported", - from_type->getName(), to_type->getName()); - } - } - - template - void checkEnumToEnumConversion(const EnumTypeFrom * from_type, const EnumTypeTo * to_type) const - { - const auto & from_values = from_type->getValues(); - const auto & to_values = to_type->getValues(); - - using ValueType = std::common_type_t; - using NameValuePair = std::pair; - using EnumValues = std::vector; - - EnumValues name_intersection; - std::set_intersection(std::begin(from_values), std::end(from_values), - std::begin(to_values), std::end(to_values), std::back_inserter(name_intersection), - [] (auto && from, auto && to) { return from.first < to.first; }); - - for (const auto & name_value : name_intersection) - { - const auto & old_value = name_value.second; - const auto & new_value = to_type->getValue(name_value.first); - if (old_value != new_value) - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Enum conversion changes value for element '{}' from {} to {}", - name_value.first, toString(old_value), toString(new_value)); - } - } - - template - WrapperType createStringToEnumWrapper() const - { - const char * function_name = cast_name; - return [function_name] ( - ColumnsWithTypeAndName & arguments, const DataTypePtr & res_type, const ColumnNullable * nullable_col, size_t /*input_rows_count*/) - { - const auto & first_col = arguments.front().column.get(); - const auto & result_type = typeid_cast(*res_type); - - const 
ColumnStringType * col = typeid_cast(first_col); - - if (col && nullable_col && nullable_col->size() != col->size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "ColumnNullable is not compatible with original"); - - if (col) - { - const auto size = col->size(); - - auto res = result_type.createColumn(); - auto & out_data = static_cast(*res).getData(); - out_data.resize(size); - - auto default_enum_value = result_type.getValues().front().second; - - if (nullable_col) - { - for (size_t i = 0; i < size; ++i) - { - if (!nullable_col->isNullAt(i)) - out_data[i] = result_type.getValue(col->getDataAt(i)); - else - out_data[i] = default_enum_value; - } - } - else - { - for (size_t i = 0; i < size; ++i) - out_data[i] = result_type.getValue(col->getDataAt(i)); - } - - return res; - } - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected column {} as first argument of function {}", - first_col->getName(), function_name); - }; - } - - template - WrapperType createEnumToStringWrapper() const - { - const char * function_name = cast_name; - return [function_name] ( - ColumnsWithTypeAndName & arguments, const DataTypePtr & res_type, const ColumnNullable * nullable_col, size_t /*input_rows_count*/) - { - using ColumnEnumType = EnumType::ColumnType; - - const auto & first_col = arguments.front().column.get(); - const auto & first_type = arguments.front().type.get(); - - const ColumnEnumType * enum_col = typeid_cast(first_col); - const EnumType * enum_type = typeid_cast(first_type); - - if (enum_col && nullable_col && nullable_col->size() != enum_col->size()) - throw Exception(ErrorCodes::LOGICAL_ERROR, "ColumnNullable is not compatible with original"); - - if (enum_col && enum_type) - { - const auto size = enum_col->size(); - const auto & enum_data = enum_col->getData(); - - auto res = res_type->createColumn(); - - if (nullable_col) - { - for (size_t i = 0; i < size; ++i) - { - if (!nullable_col->isNullAt(i)) - { - const auto & value = enum_type->getNameForValue(enum_data[i]); - res->insertData(value.data, value.size); - } - else - res->insertDefault(); - } - } - else - { - for (size_t i = 0; i < size; ++i) - { - const auto & value = enum_type->getNameForValue(enum_data[i]); - res->insertData(value.data, value.size); - } - } - - return res; - } - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected column {} as first argument of function {}", - first_col->getName(), function_name); - }; - } - - static WrapperType createIdentityWrapper(const DataTypePtr &) - { - return [] (ColumnsWithTypeAndName & arguments, const DataTypePtr &, const ColumnNullable *, size_t /*input_rows_count*/) - { - return arguments.front().column; - }; - } - - static WrapperType createNothingWrapper(const IDataType * to_type) - { - ColumnPtr res = to_type->createColumnConstWithDefaultValue(1); - return [res] (ColumnsWithTypeAndName &, const DataTypePtr &, const ColumnNullable *, size_t input_rows_count) - { - /// Column of Nothing type is trivially convertible to any other column - return res->cloneResized(input_rows_count)->convertToFullColumnIfConst(); - }; - } - - WrapperType prepareUnpackDictionaries(const DataTypePtr & from_type, const DataTypePtr & to_type) const - { - /// Conversion from/to Variant data type is processed in a special way. - /// We don't need to remove LowCardinality/Nullable. 
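prepareUnpackDictionaries exists because a LowCardinality column can be converted cheaply: instead of casting every row, only the (small) dictionary is casted and the per-row index column is reused. A standalone sketch of that fast path with std containers — an Int-to-String cast over illustrative values:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<int> dictionary = {10, 20, 30};
    std::vector<uint32_t> indexes = {0, 2, 2, 1, 0}; /// one entry per row

    /// "Cast" the dictionary only: Int -> String.
    std::vector<std::string> casted_dictionary;
    casted_dictionary.reserve(dictionary.size());
    for (int v : dictionary)
        casted_dictionary.push_back(std::to_string(v));

    /// Rows are still represented by the untouched index column.
    for (uint32_t i : indexes)
        std::cout << casted_dictionary[i] << ' ';
    std::cout << '\n';
}

The wrapper falls back to materializing the full column (the src_converted_to_full_column branch) only when defaults cannot be mapped one-to-one between the types.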
- if (isVariant(to_type) || isVariant(from_type)) - return createVariantWrapper(from_type, to_type); - - const auto * from_low_cardinality = typeid_cast(from_type.get()); - const auto * to_low_cardinality = typeid_cast(to_type.get()); - const auto & from_nested = from_low_cardinality ? from_low_cardinality->getDictionaryType() : from_type; - const auto & to_nested = to_low_cardinality ? to_low_cardinality->getDictionaryType() : to_type; - - if (from_type->onlyNull()) - { - if (!to_nested->isNullable() && !isVariant(to_type)) - { - if (cast_type == CastType::accurateOrNull) - { - return createToNullableColumnWrapper(); - } - else - { - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Cannot convert NULL to a non-nullable type"); - } - } - - return [](ColumnsWithTypeAndName &, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) - { - return result_type->createColumnConstWithDefaultValue(input_rows_count)->convertToFullColumnIfConst(); - }; - } - - bool skip_not_null_check = false; - - if (from_low_cardinality && from_nested->isNullable() && !to_nested->isNullable()) - /// Disable check for dictionary. Will check that column doesn't contain NULL in wrapper below. - skip_not_null_check = true; - - auto wrapper = prepareRemoveNullable(from_nested, to_nested, skip_not_null_check); - if (!from_low_cardinality && !to_low_cardinality) - return wrapper; - - return [wrapper, from_low_cardinality, to_low_cardinality, skip_not_null_check] - (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * nullable_source, size_t input_rows_count) -> ColumnPtr - { - ColumnsWithTypeAndName args = {arguments[0]}; - auto & arg = args.front(); - auto res_type = result_type; - - ColumnPtr converted_column; - - ColumnPtr res_indexes; - /// For some types default can't be casted (for example, String to Int). In that case convert column to full. - bool src_converted_to_full_column = false; - - { - auto tmp_rows_count = input_rows_count; - - if (to_low_cardinality) - res_type = to_low_cardinality->getDictionaryType(); - - if (from_low_cardinality) - { - const auto * col_low_cardinality = typeid_cast(arguments[0].column.get()); - - if (skip_not_null_check && col_low_cardinality->containsNull()) - throw Exception(ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN, "Cannot convert NULL value to non-Nullable type"); - - arg.column = col_low_cardinality->getDictionary().getNestedColumn(); - arg.type = from_low_cardinality->getDictionaryType(); - - /// TODO: Make map with defaults conversion. - src_converted_to_full_column = !removeNullable(arg.type)->equals(*removeNullable(res_type)); - if (src_converted_to_full_column) - arg.column = arg.column->index(col_low_cardinality->getIndexes(), 0); - else - res_indexes = col_low_cardinality->getIndexesPtr(); - - tmp_rows_count = arg.column->size(); - } - - /// Perform the requested conversion. 
- converted_column = wrapper(args, res_type, nullable_source, tmp_rows_count); - } - - if (to_low_cardinality) - { - auto res_column = to_low_cardinality->createColumn(); - auto * col_low_cardinality = typeid_cast(res_column.get()); - - if (from_low_cardinality && !src_converted_to_full_column) - { - col_low_cardinality->insertRangeFromDictionaryEncodedColumn(*converted_column, *res_indexes); - } - else - col_low_cardinality->insertRangeFromFullColumn(*converted_column, 0, converted_column->size()); - - return res_column; - } - else if (!src_converted_to_full_column) - return converted_column->index(*res_indexes, 0); - else - return converted_column; - }; - } - - WrapperType prepareRemoveNullable(const DataTypePtr & from_type, const DataTypePtr & to_type, bool skip_not_null_check) const - { - /// Determine whether pre-processing and/or post-processing must take place during conversion. - - bool source_is_nullable = from_type->isNullable(); - bool result_is_nullable = to_type->isNullable(); - - auto wrapper = prepareImpl(removeNullable(from_type), removeNullable(to_type), result_is_nullable); - - if (result_is_nullable) - { - return [wrapper, source_is_nullable] - (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr - { - /// Create a temporary columns on which to perform the operation. - const auto & nullable_type = static_cast(*result_type); - const auto & nested_type = nullable_type.getNestedType(); - - ColumnsWithTypeAndName tmp_args; - if (source_is_nullable) - tmp_args = createBlockWithNestedColumns(arguments); - else - tmp_args = arguments; - - const ColumnNullable * nullable_source = nullptr; - - /// Add original ColumnNullable for createStringToEnumWrapper() - if (source_is_nullable) - { - if (arguments.size() != 1) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Invalid number of arguments"); - nullable_source = typeid_cast(arguments.front().column.get()); - } - - /// Perform the requested conversion. - auto tmp_res = wrapper(tmp_args, nested_type, nullable_source, input_rows_count); - - /// May happen in fuzzy tests. For debug purpose. - if (!tmp_res) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Couldn't convert {} to {} in prepareRemoveNullable wrapper.", - arguments[0].type->getName(), nested_type->getName()); - - return wrapInNullable(tmp_res, arguments, nested_type, input_rows_count); - }; - } - else if (source_is_nullable) - { - /// Conversion from Nullable to non-Nullable. - - return [wrapper, skip_not_null_check] - (ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr - { - auto tmp_args = createBlockWithNestedColumns(arguments); - auto nested_type = removeNullable(result_type); - - /// Check that all values are not-NULL. - /// Check can be skipped in case if LowCardinality dictionary is transformed. - /// In that case, correctness will be checked beforehand. 
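Dropping Nullable is only valid when no row is actually NULL, which is why the wrapper below scans the null map before converting. A standalone sketch of that guard in plain C++ (memoryIsZero in ClickHouse performs the same scan over the null-map bytes):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

int main()
{
    std::vector<int> nested = {1, 2, 3};
    std::vector<uint8_t> null_map = {0, 0, 0}; /// flip any byte to 1 to see the throw

    if (std::any_of(null_map.begin(), null_map.end(), [](uint8_t b) { return b != 0; }))
        throw std::runtime_error("Cannot convert NULL value to non-Nullable type");

    for (int v : nested)
        std::cout << v << ' ';
    std::cout << '\n';
}

When the source is a LowCardinality dictionary the per-row scan is skipped (skip_not_null_check) because containsNull() on the dictionary already answered the question.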
- if (!skip_not_null_check) - { - const auto & col = arguments[0].column; - const auto & nullable_col = assert_cast(*col); - const auto & null_map = nullable_col.getNullMapData(); - - if (!memoryIsZero(null_map.data(), 0, null_map.size())) - throw Exception(ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN, "Cannot convert NULL value to non-Nullable type"); - } - const ColumnNullable * nullable_source = typeid_cast(arguments.front().column.get()); - return wrapper(tmp_args, nested_type, nullable_source, input_rows_count); - }; - } - else - return wrapper; - } - - /// 'from_type' and 'to_type' are nested types in case of Nullable. - /// 'requested_result_is_nullable' is true if CAST to Nullable type is requested. - WrapperType prepareImpl(const DataTypePtr & from_type, const DataTypePtr & to_type, bool requested_result_is_nullable) const - { - if (isUInt8(from_type) && isBool(to_type)) - return createUInt8ToBoolWrapper(from_type, to_type); - - /// We can cast IPv6 into IPv6, IPv4 into IPv4, but we should not allow to cast FixedString(16) into IPv6 as part of identity cast - bool safe_convert_custom_types = true; - - if (const auto * to_type_custom_name = to_type->getCustomName()) - safe_convert_custom_types = from_type->getCustomName() && from_type->getCustomName()->getName() == to_type_custom_name->getName(); - else if (const auto * from_type_custom_name = from_type->getCustomName()) - safe_convert_custom_types = to_type->getCustomName() && from_type_custom_name->getName() == to_type->getCustomName()->getName(); - - if (from_type->equals(*to_type) && safe_convert_custom_types) - { - /// We can only use identity conversion for DataTypeAggregateFunction when they are strictly equivalent. - if (typeid_cast(from_type.get())) - { - if (DataTypeAggregateFunction::strictEquals(from_type, to_type)) - return createIdentityWrapper(from_type); - } - else - return createIdentityWrapper(from_type); - } - else if (WhichDataType(from_type).isNothing()) - return createNothingWrapper(to_type.get()); - - WrapperType ret; - - auto make_default_wrapper = [&](const auto & types) -> bool - { - using Types = std::decay_t; - using ToDataType = typename Types::LeftType; - - if constexpr ( - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v || - std::is_same_v) - { - ret = createWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); - return true; - } - if constexpr (std::is_same_v) - { - if (isBool(to_type)) - ret = createBoolWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); - else - ret = createWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); - return true; - } - if constexpr ( - std::is_same_v || - std::is_same_v) - { - ret = createEnumWrapper(from_type, checkAndGetDataType(to_type.get())); - return true; - } - if constexpr ( - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v> || - std::is_same_v) - { - ret = createDecimalWrapper(from_type, checkAndGetDataType(to_type.get()), requested_result_is_nullable); - return true; - } - - return false; - }; - - bool cast_ipv4_ipv6_default_on_conversion_error_value = context && 
context->getSettingsRef().cast_ipv4_ipv6_default_on_conversion_error; - bool input_format_ipv4_default_on_conversion_error_value = context && context->getSettingsRef().input_format_ipv4_default_on_conversion_error; - bool input_format_ipv6_default_on_conversion_error_value = context && context->getSettingsRef().input_format_ipv6_default_on_conversion_error; - - auto make_custom_serialization_wrapper = [&, cast_ipv4_ipv6_default_on_conversion_error_value, input_format_ipv4_default_on_conversion_error_value, input_format_ipv6_default_on_conversion_error_value](const auto & types) -> bool - { - using Types = std::decay_t; - using ToDataType = typename Types::RightType; - using FromDataType = typename Types::LeftType; - - if constexpr (WhichDataType(FromDataType::type_id).isStringOrFixedString()) - { - if constexpr (std::is_same_v) - { - ret = [cast_ipv4_ipv6_default_on_conversion_error_value, - input_format_ipv4_default_on_conversion_error_value, - requested_result_is_nullable]( - ColumnsWithTypeAndName & arguments, - const DataTypePtr & result_type, - const ColumnNullable * column_nullable, - size_t) -> ColumnPtr - { - if (!WhichDataType(result_type).isIPv4()) - throw Exception(ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected IPv4", result_type->getName()); - - const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr; - if (requested_result_is_nullable) - return convertToIPv4(arguments[0].column, null_map); - else if (cast_ipv4_ipv6_default_on_conversion_error_value || input_format_ipv4_default_on_conversion_error_value) - return convertToIPv4(arguments[0].column, null_map); - else - return convertToIPv4(arguments[0].column, null_map); - }; - - return true; - } - - if constexpr (std::is_same_v) - { - ret = [cast_ipv4_ipv6_default_on_conversion_error_value, - input_format_ipv6_default_on_conversion_error_value, - requested_result_is_nullable]( - ColumnsWithTypeAndName & arguments, - const DataTypePtr & result_type, - const ColumnNullable * column_nullable, - size_t) -> ColumnPtr - { - if (!WhichDataType(result_type).isIPv6()) - throw Exception( - ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected IPv6", result_type->getName()); - - const auto * null_map = column_nullable ? 
&column_nullable->getNullMapData() : nullptr; - if (requested_result_is_nullable) - return convertToIPv6(arguments[0].column, null_map); - else if (cast_ipv4_ipv6_default_on_conversion_error_value || input_format_ipv6_default_on_conversion_error_value) - return convertToIPv6(arguments[0].column, null_map); - else - return convertToIPv6(arguments[0].column, null_map); - }; - - return true; - } - - if (to_type->getCustomSerialization() && to_type->getCustomName()) - { - ret = [requested_result_is_nullable]( - ColumnsWithTypeAndName & arguments, - const DataTypePtr & result_type, - const ColumnNullable * column_nullable, - size_t input_rows_count) -> ColumnPtr - { - auto wrapped_result_type = result_type; - if (requested_result_is_nullable) - wrapped_result_type = makeNullable(result_type); - return ConvertImplGenericFromString::execute( - arguments, wrapped_result_type, column_nullable, input_rows_count); - }; - return true; - } - } - else if constexpr (WhichDataType(FromDataType::type_id).isIPv6() && WhichDataType(ToDataType::type_id).isIPv4()) - { - ret = [cast_ipv4_ipv6_default_on_conversion_error_value, requested_result_is_nullable]( - ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable * column_nullable, size_t) - -> ColumnPtr - { - if (!WhichDataType(result_type).isIPv4()) - throw Exception( - ErrorCodes::TYPE_MISMATCH, "Wrong result type {}. Expected IPv4", result_type->getName()); - - const auto * null_map = column_nullable ? &column_nullable->getNullMapData() : nullptr; - if (requested_result_is_nullable) - return convertIPv6ToIPv4(arguments[0].column, null_map); - else if (cast_ipv4_ipv6_default_on_conversion_error_value) - return convertIPv6ToIPv4(arguments[0].column, null_map); - else - return convertIPv6ToIPv4(arguments[0].column, null_map); - }; - - return true; - } - - if constexpr (WhichDataType(ToDataType::type_id).isStringOrFixedString()) - { - if constexpr (WhichDataType(FromDataType::type_id).isEnum()) - { - ret = createEnumToStringWrapper(); - return true; - } - else if (from_type->getCustomSerialization()) - { - ret = [](ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, const ColumnNullable *, size_t input_rows_count) -> ColumnPtr - { - return ConvertImplGenericToString::execute(arguments, result_type, input_rows_count); - }; - return true; - } - } - - return false; - }; - - if (callOnTwoTypeIndexes(from_type->getTypeId(), to_type->getTypeId(), make_custom_serialization_wrapper)) - return ret; - - if (callOnIndexAndDataType(to_type->getTypeId(), make_default_wrapper)) - return ret; - - switch (to_type->getTypeId()) - { - case TypeIndex::String: - return createStringWrapper(from_type); - case TypeIndex::FixedString: - return createFixedStringWrapper(from_type, checkAndGetDataType(to_type.get())->getN()); - case TypeIndex::Array: - return createArrayWrapper(from_type, static_cast(*to_type)); - case TypeIndex::Tuple: - return createTupleWrapper(from_type, checkAndGetDataType(to_type.get())); - case TypeIndex::Map: - return createMapWrapper(from_type, checkAndGetDataType(to_type.get())); - case TypeIndex::Object: - return createObjectWrapper(from_type, checkAndGetDataType(to_type.get())); - case TypeIndex::AggregateFunction: - return createAggregateFunctionWrapper(from_type, checkAndGetDataType(to_type.get())); - case TypeIndex::Interval: - return createIntervalWrapper(from_type, checkAndGetDataType(to_type.get())->getKind()); - default: - break; - } - - if (cast_type == CastType::accurateOrNull) - return 
createToNullableColumnWrapper(); - else - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion from {} to {} is not supported", - from_type->getName(), to_type->getName()); - } -}; - -class MonotonicityHelper -{ -public: - using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; - - template - static auto monotonicityForType(const DataType * const) - { - return FunctionTo::Type::Monotonic::get; - } - - static MonotonicityForRange getMonotonicityInformation(const DataTypePtr & from_type, const IDataType * to_type); -}; - -FunctionBasePtr createFunctionBaseCast( - ContextPtr context - , const ColumnsWithTypeAndName & arguments - , const DataTypePtr & return_type - , std::optional diagnostic - , CastType cast_type); - -} From 70796e497f1970d186d017226e93a67f1c6d170f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 08:32:13 +0100 Subject: [PATCH 0178/1081] Miscellaneous --- programs/library-bridge/createFunctionBaseCast.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/library-bridge/createFunctionBaseCast.cpp b/programs/library-bridge/createFunctionBaseCast.cpp index 473aa1ca81d..194fc4bfcf7 100644 --- a/programs/library-bridge/createFunctionBaseCast.cpp +++ b/programs/library-bridge/createFunctionBaseCast.cpp @@ -13,7 +13,7 @@ namespace ErrorCodes FunctionBasePtr createFunctionBaseCast( ContextPtr, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for ODBC Bridge"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); } } From 3cd70f48b0d2ef1584a9351291de4d230e71c021 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 08:49:55 +0100 Subject: [PATCH 0179/1081] Anonymous --- src/Functions/CastOverloadResolver.cpp | 15 ----------- src/Functions/FunctionsConversion.cpp | 37 +++++++++++++------------- 2 files changed, 19 insertions(+), 33 deletions(-) diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 4a081d684f6..ad4b28d11c1 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -117,21 +117,6 @@ private: }; -UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) -{ - const auto * arg_type = named_column.type.get(); - bool ok = checkAndGetDataType(arg_type) - || checkAndGetDataType(arg_type) - || checkAndGetDataType(arg_type) - || checkAndGetDataType(arg_type); - if (!ok) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type of toDecimal() scale {}", named_column.type->getName()); - - Field field; - named_column.column->get(0, field); - return static_cast(field.get()); -} - FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic) { return CastOverloadResolverImpl::create(ContextPtr{}, type, true, diagnostic); diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 865f7db8e12..3da6b9abe2d 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -93,11 +93,27 @@ namespace ErrorCodes extern const int VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE; } +namespace +{ + /** Type conversion functions. 
* toType - conversion in "natural way"; */ -UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column); +UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) +{ + const auto * arg_type = named_column.type.get(); + bool ok = checkAndGetDataType(arg_type) + || checkAndGetDataType(arg_type) + || checkAndGetDataType(arg_type) + || checkAndGetDataType(arg_type); + if (!ok) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type of toDecimal() scale {}", named_column.type->getName()); + + Field field; + named_column.column->get(0, field); + return static_cast(field.get()); +} /// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type. struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; }; @@ -1271,23 +1287,6 @@ void convertFromTime(typename DataType::FieldType & x, time_t & time) x = time; } -template <> -inline void convertFromTime(DataTypeDate::FieldType & x, time_t & time) -{ - if (unlikely(time < 0)) - x = 0; - else if (unlikely(time > 0xFFFF)) - x = 0xFFFF; - else - x = time; -} - -template <> -inline void convertFromTime(DataTypeDate32::FieldType & x, time_t & time) -{ - x = static_cast(time); -} - template <> inline void convertFromTime(DataTypeDateTime::FieldType & x, time_t & time) { @@ -4960,6 +4959,8 @@ public: } }; +} + FunctionBasePtr createFunctionBaseCast( ContextPtr context From d2e19a5da342236eb377e40a79a1fcd467b13b7a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 08:55:53 +0100 Subject: [PATCH 0180/1081] Remove unused header --- src/Functions/CastOverloadResolver.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index ad4b28d11c1..1e1fbd06d93 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include From 8a46eace13458a735ccce5f423f6b750b0db869a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 09:15:56 +0100 Subject: [PATCH 0181/1081] Less exceptions --- utils/check-style/check-large-objects.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/utils/check-style/check-large-objects.sh b/utils/check-style/check-large-objects.sh index 5c1276e5732..4eb9190512f 100755 --- a/utils/check-style/check-large-objects.sh +++ b/utils/check-style/check-large-objects.sh @@ -3,7 +3,6 @@ # Check that there are no new translation units compiled to an object file larger than a certain size. 
TU_EXCLUDES=( - CastOverloadResolver AggregateFunctionUniq FunctionsConversion From 194de2066c1e60a15d48f21333e97e5ec2d2f82b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 09:22:14 +0100 Subject: [PATCH 0182/1081] Loosen --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b55e9810361..9ffb4789dc9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -61,8 +61,8 @@ if (ENABLE_CHECK_HEAVY_BUILDS) # set CPU time limit to 1000 seconds set (RLIMIT_CPU 1000) - # -fsanitize=memory is too heavy - if (SANITIZE STREQUAL "memory") + # -fsanitize=memory and address are too heavy + if (SANITIZE) set (RLIMIT_DATA 10000000000) # 10G endif() From e9ab3ed2dd8ec6e81a5125c27ae4ad30882ecddf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 09:27:11 +0100 Subject: [PATCH 0183/1081] Even better --- programs/library-bridge/LibraryBridge.h | 2 +- programs/library-bridge/createFunctionBaseCast.cpp | 6 +++++- programs/odbc-bridge/createFunctionBaseCast.cpp | 8 ++++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/programs/library-bridge/LibraryBridge.h b/programs/library-bridge/LibraryBridge.h index 04860a042a3..a8d15a87e07 100644 --- a/programs/library-bridge/LibraryBridge.h +++ b/programs/library-bridge/LibraryBridge.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include "LibraryBridgeHandlerFactory.h" diff --git a/programs/library-bridge/createFunctionBaseCast.cpp b/programs/library-bridge/createFunctionBaseCast.cpp index 194fc4bfcf7..54319ca707a 100644 --- a/programs/library-bridge/createFunctionBaseCast.cpp +++ b/programs/library-bridge/createFunctionBaseCast.cpp @@ -1,5 +1,6 @@ -#include +#include #include +#include namespace DB @@ -10,6 +11,9 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +class IFunctionBase; +using FunctionBasePtr = std::shared_ptr; + FunctionBasePtr createFunctionBaseCast( ContextPtr, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { diff --git a/programs/odbc-bridge/createFunctionBaseCast.cpp b/programs/odbc-bridge/createFunctionBaseCast.cpp index 473aa1ca81d..54319ca707a 100644 --- a/programs/odbc-bridge/createFunctionBaseCast.cpp +++ b/programs/odbc-bridge/createFunctionBaseCast.cpp @@ -1,5 +1,6 @@ -#include +#include #include +#include namespace DB @@ -10,10 +11,13 @@ namespace ErrorCodes extern const int NOT_IMPLEMENTED; } +class IFunctionBase; +using FunctionBasePtr = std::shared_ptr; + FunctionBasePtr createFunctionBaseCast( ContextPtr, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for ODBC Bridge"); + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); } } From 3ac4f56cfa65443119bd0fa9d23d528adf73d7bb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 9 Mar 2024 18:53:31 +0100 Subject: [PATCH 0184/1081] Fix tests --- .../library-bridge/createFunctionBaseCast.cpp | 2 +- .../odbc-bridge/createFunctionBaseCast.cpp | 2 +- src/Functions/CastOverloadResolver.cpp | 20 ++++++++++++------- src/Functions/FunctionsConversion.cpp | 13 ++++++------ 4 files changed, 22 insertions(+), 15 deletions(-) diff --git a/programs/library-bridge/createFunctionBaseCast.cpp b/programs/library-bridge/createFunctionBaseCast.cpp index 54319ca707a..dcdd47d79ce 100644 --- a/programs/library-bridge/createFunctionBaseCast.cpp +++ 
b/programs/library-bridge/createFunctionBaseCast.cpp @@ -15,7 +15,7 @@ class IFunctionBase; using FunctionBasePtr = std::shared_ptr; FunctionBasePtr createFunctionBaseCast( - ContextPtr, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) + ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); } diff --git a/programs/odbc-bridge/createFunctionBaseCast.cpp b/programs/odbc-bridge/createFunctionBaseCast.cpp index 54319ca707a..dcdd47d79ce 100644 --- a/programs/odbc-bridge/createFunctionBaseCast.cpp +++ b/programs/odbc-bridge/createFunctionBaseCast.cpp @@ -15,7 +15,7 @@ class IFunctionBase; using FunctionBasePtr = std::shared_ptr; FunctionBasePtr createFunctionBaseCast( - ContextPtr, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) + ContextPtr, const char *, const ColumnsWithTypeAndName &, const DataTypePtr &, std::optional, CastType) { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Type conversions are not implemented for Library Bridge"); } diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 1e1fbd06d93..4e8e99b4d95 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -17,11 +17,12 @@ namespace ErrorCodes } FunctionBasePtr createFunctionBaseCast( - ContextPtr context - , const ColumnsWithTypeAndName & arguments - , const DataTypePtr & return_type - , std::optional diagnostic - , CastType cast_type); + ContextPtr context, + const char * name, + const ColumnsWithTypeAndName & arguments, + const DataTypePtr & return_type, + std::optional diagnostic, + CastType cast_type); /** CastInternal does not preserve nullability of the data type, @@ -33,7 +34,7 @@ FunctionBasePtr createFunctionBaseCast( class CastOverloadResolverImpl : public IFunctionOverloadResolver { public: - String getName() const override + const char * getNameImpl() const { if (cast_type == CastType::accurate) return "accurateCast"; @@ -45,6 +46,11 @@ public: return "CAST"; } + String getName() const override + { + return getNameImpl(); + } + size_t getNumberOfArguments() const override { return 2; } ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } @@ -72,7 +78,7 @@ public: protected: FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override { - return createFunctionBaseCast(context, arguments, return_type, diagnostic, cast_type); + return createFunctionBaseCast(context, getNameImpl(), arguments, return_type, diagnostic, cast_type); } DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 3da6b9abe2d..d1d00780205 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -4963,11 +4963,12 @@ public: FunctionBasePtr createFunctionBaseCast( - ContextPtr context - , const ColumnsWithTypeAndName & arguments - , const DataTypePtr & return_type - , std::optional diagnostic - , CastType cast_type) + ContextPtr context, + const char * name, + const ColumnsWithTypeAndName & arguments, + const DataTypePtr & return_type, + std::optional diagnostic, + CastType cast_type) { DataTypes data_types(arguments.size()); @@ -4975,7 +4976,7 @@ FunctionBasePtr createFunctionBaseCast( data_types[i] = 
arguments[i].type; auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get()); - return std::make_unique(context, "CAST", std::move(monotonicity), data_types, return_type, diagnostic, cast_type); + return std::make_unique(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type); } REGISTER_FUNCTION(Conversion) From e30a1c9cbcc465370aa3be5c96437598c1a3ac80 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 02:26:18 +0100 Subject: [PATCH 0185/1081] Remove garbage --- src/Functions/FunctionsConversion.cpp | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index d1d00780205..5347e1f739d 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -3158,15 +3158,10 @@ struct FunctionCastName static constexpr auto name = "CAST"; }; -class FunctionCastBase : public IFunctionBase +class FunctionCast final : public IFunctionBase { public: using MonotonicityForRange = std::function; -}; - -class FunctionCast final : public FunctionCastBase -{ -public: using WrapperType = std::function; FunctionCast(ContextPtr context_ @@ -4901,7 +4896,7 @@ arguments, result_type, input_rows_count); \ class MonotonicityHelper { public: - using MonotonicityForRange = FunctionCastBase::MonotonicityForRange; + using MonotonicityForRange = FunctionCast::MonotonicityForRange; template static auto monotonicityForType(const DataType * const) From bbeecf4ed5361154c2eed61f476ed9ca2c50b773 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 02:51:32 +0100 Subject: [PATCH 0186/1081] Fix error --- src/Functions/CastOverloadResolver.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp index 4e8e99b4d95..5ca4b0bc579 100644 --- a/src/Functions/CastOverloadResolver.cpp +++ b/src/Functions/CastOverloadResolver.cpp @@ -67,12 +67,15 @@ public: static FunctionOverloadResolverPtr create(ContextPtr context, CastType cast_type, bool internal, std::optional diagnostic) { - const auto & settings_ref = context->getSettingsRef(); - if (internal) + { return std::make_unique(context, cast_type, internal, diagnostic, false /*keep_nullable*/, DataTypeValidationSettings{}); + } else + { + const auto & settings_ref = context->getSettingsRef(); return std::make_unique(context, cast_type, internal, diagnostic, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref)); + } } protected: From 6c0f8773426021d93229b2c713641e189c4ffe3e Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 03:02:38 +0100 Subject: [PATCH 0187/1081] I did not understand this code and removed it --- src/Functions/FunctionsConversion.cpp | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 5347e1f739d..38c18ddf850 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -1906,18 +1906,6 @@ struct ConvertImpl } }; -template -struct ConvertImpl -{ - template - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/, - Additions additions [[maybe_unused]] = Additions()) - { - - return arguments[0].column; - } -}; - /** Conversion from FixedString to String. 
* Cutting sequences of zero bytes from end of strings. From 7c8a97a91a0842556e1ed67234c85c7aead67700 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 03:13:15 +0100 Subject: [PATCH 0188/1081] Maybe better --- src/Functions/FunctionsConversion.cpp | 425 +++++++++++++------------- 1 file changed, 206 insertions(+), 219 deletions(-) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 38c18ddf850..1d849446254 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -151,255 +151,256 @@ struct ConvertImpl { const ColumnWithTypeAndName & named_from = arguments[0]; + /// If types are the same, reuse the columns. + if (result_type->equals(*named_from.type)) + return named_from.column; + using ColVecFrom = typename FromDataType::ColumnType; using ColVecTo = typename ToDataType::ColumnType; if constexpr ((IsDataTypeDecimal || IsDataTypeDecimal) - && !(std::is_same_v || std::is_same_v)) + && !(std::is_same_v || std::is_same_v) + && (!IsDataTypeDecimalOrNumber || !IsDataTypeDecimalOrNumber)) { - if constexpr (!IsDataTypeDecimalOrNumber || !IsDataTypeDecimalOrNumber) - { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - named_from.column->getName(), Name::name); - } + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + named_from.column->getName(), Name::name); } - if (const ColVecFrom * col_from = checkAndGetColumn(named_from.column.get())) + const ColVecFrom * col_from = checkAndGetColumn(named_from.column.get()); + if (!col_from) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + named_from.column->getName(), Name::name); + + typename ColVecTo::MutablePtr col_to = nullptr; + + if constexpr (IsDataTypeDecimal) { - typename ColVecTo::MutablePtr col_to = nullptr; + UInt32 scale; - if constexpr (IsDataTypeDecimal) + if constexpr (std::is_same_v + || std::is_same_v) { - UInt32 scale; - - if constexpr (std::is_same_v - || std::is_same_v) - { - scale = additions.scale; - } - else - { - scale = additions; - } - - col_to = ColVecTo::create(0, scale); + scale = additions.scale; } else - col_to = ColVecTo::create(); - - const auto & vec_from = col_from->getData(); - auto & vec_to = col_to->getData(); - vec_to.resize(input_rows_count); - - ColumnUInt8::MutablePtr col_null_map_to; - ColumnUInt8::Container * vec_null_map_to [[maybe_unused]] = nullptr; - if constexpr (std::is_same_v) { - col_null_map_to = ColumnUInt8::create(input_rows_count, false); - vec_null_map_to = &col_null_map_to->getData(); + scale = additions; } - bool result_is_bool = isBool(result_type); - for (size_t i = 0; i < input_rows_count; ++i) + col_to = ColVecTo::create(0, scale); + } + else + col_to = ColVecTo::create(); + + const auto & vec_from = col_from->getData(); + auto & vec_to = col_to->getData(); + vec_to.resize(input_rows_count); + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to [[maybe_unused]] = nullptr; + if constexpr (std::is_same_v) + { + col_null_map_to = ColumnUInt8::create(input_rows_count, false); + vec_null_map_to = &col_null_map_to->getData(); + } + + bool result_is_bool = isBool(result_type); + for (size_t i = 0; i < input_rows_count; ++i) + { + if constexpr (std::is_same_v) { - if constexpr (std::is_same_v) + if (result_is_bool) { - if (result_is_bool) - { - vec_to[i] = vec_from[i] != FromFieldType(0); - continue; - } - } - - if 
constexpr (std::is_same_v && std::is_same_v) - { - static_assert( - std::is_same_v, - "UInt128 and UUID types must be same"); - - vec_to[i].items[1] = vec_from[i].toUnderType().items[0]; - vec_to[i].items[0] = vec_from[i].toUnderType().items[1]; - + vec_to[i] = vec_from[i] != FromFieldType(0); continue; } + } - if constexpr (std::is_same_v && std::is_same_v) - { - static_assert( - std::is_same_v, - "UInt128 and IPv6 types must be same"); + if constexpr (std::is_same_v && std::is_same_v) + { + static_assert( + std::is_same_v, + "UInt128 and UUID types must be same"); - vec_to[i].items[1] = std::byteswap(vec_from[i].toUnderType().items[0]); - vec_to[i].items[0] = std::byteswap(vec_from[i].toUnderType().items[1]); + vec_to[i].items[1] = vec_from[i].toUnderType().items[0]; + vec_to[i].items[0] = vec_from[i].toUnderType().items[1]; - continue; - } + continue; + } - if constexpr (std::is_same_v != std::is_same_v) + if constexpr (std::is_same_v && std::is_same_v) + { + static_assert( + std::is_same_v, + "UInt128 and IPv6 types must be same"); + + vec_to[i].items[1] = std::byteswap(vec_from[i].toUnderType().items[0]); + vec_to[i].items[0] = std::byteswap(vec_from[i].toUnderType().items[1]); + + continue; + } + + if constexpr (std::is_same_v != std::is_same_v) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Conversion between numeric types and UUID is not supported. " + "Probably the passed UUID is unquoted"); + } + else if constexpr ( + (std::is_same_v != std::is_same_v) + && !(is_any_of || is_any_of) + ) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Conversion from {} to {} is not supported", + TypeName, TypeName); + } + else if constexpr (std::is_same_v != std::is_same_v && !(std::is_same_v || std::is_same_v)) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Conversion between numeric types and IPv6 is not supported. " + "Probably the passed IPv6 is unquoted"); + } + else + { + if constexpr (IsDataTypeDecimal || IsDataTypeDecimal) { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Conversion between numeric types and UUID is not supported. " - "Probably the passed UUID is unquoted"); - } - else if constexpr ( - (std::is_same_v != std::is_same_v) - && !(is_any_of || is_any_of) - ) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Conversion from {} to {} is not supported", - TypeName, TypeName); - } - else if constexpr (std::is_same_v != std::is_same_v && !(std::is_same_v || std::is_same_v)) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Conversion between numeric types and IPv6 is not supported. 
" - "Probably the passed IPv6 is unquoted"); - } - else - { - if constexpr (IsDataTypeDecimal || IsDataTypeDecimal) + if constexpr (std::is_same_v) { - if constexpr (std::is_same_v) - { - ToFieldType result; - bool convert_result = false; + ToFieldType result; + bool convert_result = false; - if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) - convert_result = tryConvertDecimals(vec_from[i], col_from->getScale(), col_to->getScale(), result); - else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) - convert_result = tryConvertFromDecimal(vec_from[i], col_from->getScale(), result); - else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) - convert_result = tryConvertToDecimal(vec_from[i], col_to->getScale(), result); + if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) + convert_result = tryConvertDecimals(vec_from[i], col_from->getScale(), col_to->getScale(), result); + else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) + convert_result = tryConvertFromDecimal(vec_from[i], col_from->getScale(), result); + else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) + convert_result = tryConvertToDecimal(vec_from[i], col_to->getScale(), result); - if (convert_result) - vec_to[i] = result; - else - { - vec_to[i] = static_cast(0); - (*vec_null_map_to)[i] = true; - } - } + if (convert_result) + vec_to[i] = result; else { - if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) - vec_to[i] = convertDecimals(vec_from[i], col_from->getScale(), col_to->getScale()); - else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) - vec_to[i] = convertFromDecimal(vec_from[i], col_from->getScale()); - else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) - vec_to[i] = convertToDecimal(vec_from[i], col_to->getScale()); - else - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Unsupported data type in conversion function"); + vec_to[i] = static_cast(0); + (*vec_null_map_to)[i] = true; } } else { - /// If From Data is Nan or Inf and we convert to integer type, throw exception - if constexpr (std::is_floating_point_v && !std::is_floating_point_v) - { - if (!isFinite(vec_from[i])) - { - if constexpr (std::is_same_v) - { - vec_to[i] = 0; - (*vec_null_map_to)[i] = true; - continue; - } - else - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Unexpected inf or nan to integer conversion"); - } - } - - if constexpr (std::is_same_v - || std::is_same_v) - { - bool convert_result = accurate::convertNumeric(vec_from[i], vec_to[i]); - - if (!convert_result) - { - if (std::is_same_v) - { - vec_to[i] = 0; - (*vec_null_map_to)[i] = true; - } - else - { - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", - named_from.column->getName(), result_type->getName()); - } - } - } + if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) + vec_to[i] = convertDecimals(vec_from[i], col_from->getScale(), col_to->getScale()); + else if constexpr (IsDataTypeDecimal && IsDataTypeNumber) + vec_to[i] = convertFromDecimal(vec_from[i], col_from->getScale()); + else if constexpr (IsDataTypeNumber && IsDataTypeDecimal) + vec_to[i] = convertToDecimal(vec_from[i], col_to->getScale()); else + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Unsupported data type in conversion function"); + } + } + else + { + /// If From Data is Nan or Inf and we convert to integer type, throw exception + if constexpr (std::is_floating_point_v && !std::is_floating_point_v) + { + if (!isFinite(vec_from[i])) { - if constexpr (std::is_same_v && std::is_same_v) + if constexpr 
(std::is_same_v) { - const uint8_t ip4_cidr[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; - const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); - if (!matchIPv6Subnet(src, ip4_cidr, 96)) - { - char addr[IPV6_MAX_TEXT_LENGTH + 1] {}; - char * paddr = addr; - formatIPv6(src, paddr); - - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "IPv6 {} in column {} is not in IPv4 mapping block", addr, named_from.column->getName()); - } - - uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); - if constexpr (std::endian::native == std::endian::little) - { - dst[0] = src[15]; - dst[1] = src[14]; - dst[2] = src[13]; - dst[3] = src[12]; - } - else - { - dst[0] = src[12]; - dst[1] = src[13]; - dst[2] = src[14]; - dst[3] = src[15]; - } + vec_to[i] = 0; + (*vec_null_map_to)[i] = true; + continue; } - else if constexpr (std::is_same_v && std::is_same_v) - { - const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); - uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); - std::memset(dst, '\0', IPV6_BINARY_LENGTH); - dst[10] = dst[11] = 0xff; - - if constexpr (std::endian::native == std::endian::little) - { - dst[12] = src[3]; - dst[13] = src[2]; - dst[14] = src[1]; - dst[15] = src[0]; - } - else - { - dst[12] = src[0]; - dst[13] = src[1]; - dst[14] = src[2]; - dst[15] = src[3]; - } - } - else if constexpr (std::is_same_v && std::is_same_v) - vec_to[i] = static_cast(static_cast(vec_from[i])); - else if constexpr (std::is_same_v && (std::is_same_v || std::is_same_v)) - vec_to[i] = static_cast(vec_from[i] * DATE_SECONDS_PER_DAY); else - vec_to[i] = static_cast(vec_from[i]); + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Unexpected inf or nan to integer conversion"); } } + + if constexpr (std::is_same_v + || std::is_same_v) + { + bool convert_result = accurate::convertNumeric(vec_from[i], vec_to[i]); + + if (!convert_result) + { + if (std::is_same_v) + { + vec_to[i] = 0; + (*vec_null_map_to)[i] = true; + } + else + { + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Value in column {} cannot be safely converted into type {}", + named_from.column->getName(), result_type->getName()); + } + } + } + else + { + if constexpr (std::is_same_v && std::is_same_v) + { + const uint8_t ip4_cidr[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; + const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); + if (!matchIPv6Subnet(src, ip4_cidr, 96)) + { + char addr[IPV6_MAX_TEXT_LENGTH + 1] {}; + char * paddr = addr; + formatIPv6(src, paddr); + + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "IPv6 {} in column {} is not in IPv4 mapping block", addr, named_from.column->getName()); + } + + uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); + if constexpr (std::endian::native == std::endian::little) + { + dst[0] = src[15]; + dst[1] = src[14]; + dst[2] = src[13]; + dst[3] = src[12]; + } + else + { + dst[0] = src[12]; + dst[1] = src[13]; + dst[2] = src[14]; + dst[3] = src[15]; + } + } + else if constexpr (std::is_same_v && std::is_same_v) + { + const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); + uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); + std::memset(dst, '\0', IPV6_BINARY_LENGTH); + dst[10] = dst[11] = 0xff; + + if constexpr (std::endian::native == std::endian::little) + { + dst[12] = src[3]; + dst[13] = src[2]; + dst[14] = src[1]; + dst[15] = src[0]; + } + else + { + dst[12] = src[0]; + dst[13] = src[1]; + dst[14] = src[2]; + 
dst[15] = src[3]; + } + } + else if constexpr (std::is_same_v && std::is_same_v) + vec_to[i] = static_cast(static_cast(vec_from[i])); + else if constexpr (std::is_same_v && (std::is_same_v || std::is_same_v)) + vec_to[i] = static_cast(vec_from[i] * DATE_SECONDS_PER_DAY); + else + vec_to[i] = static_cast(vec_from[i]); + } } } - - if constexpr (std::is_same_v) - return ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); - else - return col_to; } + + if constexpr (std::is_same_v) + return ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); else - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - named_from.column->getName(), Name::name); + return col_to; } }; @@ -1892,20 +1893,6 @@ template <> struct ConvertImpl : ConvertImpl {}; -/** If types are identical, just take reference to column. - */ -template -requires (!T::is_parametric) -struct ConvertImpl -{ - template - static ColumnPtr execute(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/, - Additions additions [[maybe_unused]] = Additions()) - { - return arguments[0].column; - } -}; - /** Conversion from FixedString to String. * Cutting sequences of zero bytes from end of strings. From 3c9e6cfc96c8ba7cbc805260bfdec9ca18776ad0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 03:52:14 +0100 Subject: [PATCH 0189/1081] Maybe better --- src/Functions/FunctionsConversion.cpp | 322 +++++++++++++------------- 1 file changed, 161 insertions(+), 161 deletions(-) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 1d849446254..66ba2d18f08 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -152,117 +152,113 @@ struct ConvertImpl const ColumnWithTypeAndName & named_from = arguments[0]; /// If types are the same, reuse the columns. 
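To make the reuse-the-columns early exit above concrete, here is a minimal self-contained C++ sketch of the same idea, using stand-in types rather than ClickHouse's real IColumn/IDataType interfaces: when the source and result types already match, the immutable input column is shared as-is and the per-row conversion loop never runs.

    #include <cstdint>
    #include <memory>
    #include <string>
    #include <vector>

    // Stand-ins for the real column machinery; purely illustrative.
    struct Column { std::vector<int64_t> data; };
    using ColumnPtr = std::shared_ptr<const Column>;

    ColumnPtr convert(const ColumnPtr & col, const std::string & from_type, const std::string & to_type)
    {
        /// The short-circuit: identical types mean the immutable column can be reused.
        if (from_type == to_type)
            return col;

        /// Otherwise, fall through to the element-wise path.
        auto result = std::make_shared<Column>();
        result->data.reserve(col->data.size());
        for (int64_t v : col->data)
            result->data.push_back(v); /// placeholder for real per-type conversion
        return result;
    }

Sharing the column this way is safe because columns are immutable once built; the ConvertImpl specialization removed by the patch ("if types are identical, just take reference to column") achieved the same effect through template machinery.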
- if (result_type->equals(*named_from.type)) + if constexpr (std::is_same_v && !FromDataType::is_parametric) + { return named_from.column; - - using ColVecFrom = typename FromDataType::ColumnType; - using ColVecTo = typename ToDataType::ColumnType; - - if constexpr ((IsDataTypeDecimal || IsDataTypeDecimal) - && !(std::is_same_v || std::is_same_v) - && (!IsDataTypeDecimalOrNumber || !IsDataTypeDecimalOrNumber)) - { - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - named_from.column->getName(), Name::name); - } - - const ColVecFrom * col_from = checkAndGetColumn(named_from.column.get()); - if (!col_from) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", - named_from.column->getName(), Name::name); - - typename ColVecTo::MutablePtr col_to = nullptr; - - if constexpr (IsDataTypeDecimal) - { - UInt32 scale; - - if constexpr (std::is_same_v - || std::is_same_v) - { - scale = additions.scale; - } - else - { - scale = additions; - } - - col_to = ColVecTo::create(0, scale); } else - col_to = ColVecTo::create(); - - const auto & vec_from = col_from->getData(); - auto & vec_to = col_to->getData(); - vec_to.resize(input_rows_count); - - ColumnUInt8::MutablePtr col_null_map_to; - ColumnUInt8::Container * vec_null_map_to [[maybe_unused]] = nullptr; - if constexpr (std::is_same_v) { - col_null_map_to = ColumnUInt8::create(input_rows_count, false); - vec_null_map_to = &col_null_map_to->getData(); - } + using ColVecFrom = typename FromDataType::ColumnType; + using ColVecTo = typename ToDataType::ColumnType; - bool result_is_bool = isBool(result_type); - for (size_t i = 0; i < input_rows_count; ++i) - { - if constexpr (std::is_same_v) + if constexpr ((IsDataTypeDecimal || IsDataTypeDecimal) + && !(std::is_same_v || std::is_same_v) + && (!IsDataTypeDecimalOrNumber || !IsDataTypeDecimalOrNumber)) { - if (result_is_bool) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + named_from.column->getName(), Name::name); + } + + const ColVecFrom * col_from = checkAndGetColumn(named_from.column.get()); + if (!col_from) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}", + named_from.column->getName(), Name::name); + + typename ColVecTo::MutablePtr col_to = nullptr; + + if constexpr (IsDataTypeDecimal) + { + UInt32 scale; + + if constexpr (std::is_same_v + || std::is_same_v) { - vec_to[i] = vec_from[i] != FromFieldType(0); - continue; + scale = additions.scale; + } + else + { + scale = additions; } - } - if constexpr (std::is_same_v && std::is_same_v) - { - static_assert( - std::is_same_v, - "UInt128 and UUID types must be same"); - - vec_to[i].items[1] = vec_from[i].toUnderType().items[0]; - vec_to[i].items[0] = vec_from[i].toUnderType().items[1]; - - continue; - } - - if constexpr (std::is_same_v && std::is_same_v) - { - static_assert( - std::is_same_v, - "UInt128 and IPv6 types must be same"); - - vec_to[i].items[1] = std::byteswap(vec_from[i].toUnderType().items[0]); - vec_to[i].items[0] = std::byteswap(vec_from[i].toUnderType().items[1]); - - continue; - } - - if constexpr (std::is_same_v != std::is_same_v) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Conversion between numeric types and UUID is not supported. 
" - "Probably the passed UUID is unquoted"); - } - else if constexpr ( - (std::is_same_v != std::is_same_v) - && !(is_any_of || is_any_of) - ) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Conversion from {} to {} is not supported", - TypeName, TypeName); - } - else if constexpr (std::is_same_v != std::is_same_v && !(std::is_same_v || std::is_same_v)) - { - throw Exception(ErrorCodes::NOT_IMPLEMENTED, - "Conversion between numeric types and IPv6 is not supported. " - "Probably the passed IPv6 is unquoted"); + col_to = ColVecTo::create(0, scale); } else + col_to = ColVecTo::create(); + + const auto & vec_from = col_from->getData(); + auto & vec_to = col_to->getData(); + vec_to.resize(input_rows_count); + + ColumnUInt8::MutablePtr col_null_map_to; + ColumnUInt8::Container * vec_null_map_to [[maybe_unused]] = nullptr; + if constexpr (std::is_same_v) { - if constexpr (IsDataTypeDecimal || IsDataTypeDecimal) + col_null_map_to = ColumnUInt8::create(input_rows_count, false); + vec_null_map_to = &col_null_map_to->getData(); + } + + bool result_is_bool = isBool(result_type); + for (size_t i = 0; i < input_rows_count; ++i) + { + if constexpr (std::is_same_v) + { + if (result_is_bool) + { + vec_to[i] = vec_from[i] != FromFieldType(0); + continue; + } + } + + if constexpr (std::is_same_v && std::is_same_v) + { + static_assert( + std::is_same_v, + "UInt128 and UUID types must be same"); + + vec_to[i].items[1] = vec_from[i].toUnderType().items[0]; + vec_to[i].items[0] = vec_from[i].toUnderType().items[1]; + } + else if constexpr (std::is_same_v && std::is_same_v) + { + static_assert( + std::is_same_v, + "UInt128 and IPv6 types must be same"); + + vec_to[i].items[1] = std::byteswap(vec_from[i].toUnderType().items[0]); + vec_to[i].items[0] = std::byteswap(vec_from[i].toUnderType().items[1]); + } + else if constexpr (std::is_same_v != std::is_same_v) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Conversion between numeric types and UUID is not supported. " + "Probably the passed UUID is unquoted"); + } + else if constexpr ( + (std::is_same_v != std::is_same_v) + && !(is_any_of + || is_any_of)) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Conversion from {} to {} is not supported", + TypeName, TypeName); + } + else if constexpr (std::is_same_v != std::is_same_v + && !(std::is_same_v || std::is_same_v)) + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, + "Conversion between numeric types and IPv6 is not supported. 
" + "Probably the passed IPv6 is unquoted"); + } + else if constexpr (IsDataTypeDecimal || IsDataTypeDecimal) { if constexpr (std::is_same_v) { @@ -296,6 +292,66 @@ struct ConvertImpl throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "Unsupported data type in conversion function"); } } + else if constexpr (std::is_same_v && std::is_same_v) + { + const uint8_t ip4_cidr[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; + const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); + if (!matchIPv6Subnet(src, ip4_cidr, 96)) + { + char addr[IPV6_MAX_TEXT_LENGTH + 1] {}; + char * paddr = addr; + formatIPv6(src, paddr); + + throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "IPv6 {} in column {} is not in IPv4 mapping block", addr, named_from.column->getName()); + } + + uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); + if constexpr (std::endian::native == std::endian::little) + { + dst[0] = src[15]; + dst[1] = src[14]; + dst[2] = src[13]; + dst[3] = src[12]; + } + else + { + dst[0] = src[12]; + dst[1] = src[13]; + dst[2] = src[14]; + dst[3] = src[15]; + } + } + else if constexpr (std::is_same_v && std::is_same_v) + { + const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); + uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); + std::memset(dst, '\0', IPV6_BINARY_LENGTH); + dst[10] = dst[11] = 0xff; + + if constexpr (std::endian::native == std::endian::little) + { + dst[12] = src[3]; + dst[13] = src[2]; + dst[14] = src[1]; + dst[15] = src[0]; + } + else + { + dst[12] = src[0]; + dst[13] = src[1]; + dst[14] = src[2]; + dst[15] = src[3]; + } + } + else if constexpr (std::is_same_v && std::is_same_v) + { + vec_to[i] = static_cast(static_cast(vec_from[i])); + } + else if constexpr (std::is_same_v + && (std::is_same_v || std::is_same_v)) + { + vec_to[i] = static_cast(vec_from[i] * DATE_SECONDS_PER_DAY); + } else { /// If From Data is Nan or Inf and we convert to integer type, throw exception @@ -335,72 +391,16 @@ struct ConvertImpl } else { - if constexpr (std::is_same_v && std::is_same_v) - { - const uint8_t ip4_cidr[] {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00}; - const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); - if (!matchIPv6Subnet(src, ip4_cidr, 96)) - { - char addr[IPV6_MAX_TEXT_LENGTH + 1] {}; - char * paddr = addr; - formatIPv6(src, paddr); - - throw Exception(ErrorCodes::CANNOT_CONVERT_TYPE, "IPv6 {} in column {} is not in IPv4 mapping block", addr, named_from.column->getName()); - } - - uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); - if constexpr (std::endian::native == std::endian::little) - { - dst[0] = src[15]; - dst[1] = src[14]; - dst[2] = src[13]; - dst[3] = src[12]; - } - else - { - dst[0] = src[12]; - dst[1] = src[13]; - dst[2] = src[14]; - dst[3] = src[15]; - } - } - else if constexpr (std::is_same_v && std::is_same_v) - { - const uint8_t * src = reinterpret_cast(&vec_from[i].toUnderType()); - uint8_t * dst = reinterpret_cast(&vec_to[i].toUnderType()); - std::memset(dst, '\0', IPV6_BINARY_LENGTH); - dst[10] = dst[11] = 0xff; - - if constexpr (std::endian::native == std::endian::little) - { - dst[12] = src[3]; - dst[13] = src[2]; - dst[14] = src[1]; - dst[15] = src[0]; - } - else - { - dst[12] = src[0]; - dst[13] = src[1]; - dst[14] = src[2]; - dst[15] = src[3]; - } - } - else if constexpr (std::is_same_v && std::is_same_v) - vec_to[i] = static_cast(static_cast(vec_from[i])); - else if constexpr (std::is_same_v 
&& (std::is_same_v || std::is_same_v)) - vec_to[i] = static_cast(vec_from[i] * DATE_SECONDS_PER_DAY); - else - vec_to[i] = static_cast(vec_from[i]); + vec_to[i] = static_cast(vec_from[i]); } } } - } - if constexpr (std::is_same_v) - return ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); - else - return col_to; + if constexpr (std::is_same_v) + return ColumnNullable::create(std::move(col_to), std::move(col_null_map_to)); + else + return col_to; + } } }; From d2e29525c85458c07f385208582e820de10297e4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 04:08:34 +0100 Subject: [PATCH 0190/1081] Maybe better --- src/Functions/FunctionsConversion.cpp | 176 +++++++++++++------------- 1 file changed, 87 insertions(+), 89 deletions(-) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index 66ba2d18f08..d2d72558500 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -1,4 +1,3 @@ -#include #include #include @@ -115,6 +114,71 @@ UInt32 extractToDecimalScale(const ColumnWithTypeAndName & named_column) return static_cast(field.get()); } + +/** Conversion of Date to DateTime: adding 00:00:00 time component. + */ +template +struct ToDateTimeImpl +{ + static constexpr auto name = "toDateTime"; + + static UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (d > MAX_DATETIME_DAY_NUM) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Day number {} is out of bounds of type DateTime", d); + } + else if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) + { + if (d > MAX_DATETIME_DAY_NUM) + d = MAX_DATETIME_DAY_NUM; + } + return static_cast(time_zone.fromDayNum(DayNum(d))); + } + + static UInt32 execute(Int32 d, const DateLUTImpl & time_zone) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) + { + if (d < 0) + return 0; + else if (d > MAX_DATETIME_DAY_NUM) + d = MAX_DATETIME_DAY_NUM; + } + else if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) + { + if (d < 0 || d > MAX_DATETIME_DAY_NUM) [[unlikely]] + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type DateTime", d); + } + return static_cast(time_zone.fromDayNum(ExtendedDayNum(d))); + } + + static UInt32 execute(UInt32 dt, const DateLUTImpl & /*time_zone*/) + { + return dt; + } + + static UInt32 execute(Int64 dt64, const DateLUTImpl & /*time_zone*/) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Ignore) + return static_cast(dt64); + else + { + if (dt64 < 0 || dt64 >= MAX_DATETIME_TIMESTAMP) + { + if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) + return dt64 < 0 ? 0 : std::numeric_limits::max(); + else + throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type DateTime", dt64); + } + else + return static_cast(dt64); + } + } +}; + + /// Function toUnixTimestamp has exactly the same implementation as toDateTime of String type. struct NameToUnixTimestamp { static constexpr auto name = "toUnixTimestamp"; }; @@ -151,11 +215,32 @@ struct ConvertImpl { const ColumnWithTypeAndName & named_from = arguments[0]; - /// If types are the same, reuse the columns. 
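The ToDateTimeImpl block moved to the top of the file in the hunk above dispatches on an overflow policy at compile time. A compilable sketch of that pattern follows; the day-number bound and the seconds-per-day arithmetic are assumptions standing in for ClickHouse's real constants and DateLUTImpl time-zone lookup.

    #include <cstdint>
    #include <stdexcept>

    enum class OverflowBehavior { Throw, Saturate, Ignore };

    /// Assumption for the sketch: the last day number whose midnight still fits
    /// in a 32-bit DateTime (about 2106-02-07); the real constant lives elsewhere.
    constexpr int64_t MAX_DAY_NUM = 49710;

    template <OverflowBehavior behavior>
    uint32_t dayNumToDateTime(int64_t day_num)
    {
        if constexpr (behavior == OverflowBehavior::Throw)
        {
            if (day_num < 0 || day_num > MAX_DAY_NUM)
                throw std::out_of_range("day number is out of bounds of type DateTime");
        }
        else if constexpr (behavior == OverflowBehavior::Saturate)
        {
            if (day_num < 0)
                day_num = 0;
            else if (day_num > MAX_DAY_NUM)
                day_num = MAX_DAY_NUM;
        }
        /// Ignore: out-of-range input is simply truncated by the cast below.
        /// Simplification: midnight UTC rather than a time-zone-aware lookup.
        return static_cast<uint32_t>(day_num * 86400);
    }

Because the policy is a template parameter, the branches not taken compile away entirely, which is why the real code can afford this check on every element.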
if constexpr (std::is_same_v && !FromDataType::is_parametric) { + /// If types are the same, reuse the columns. return named_from.column; } + else if constexpr ((std::is_same_v || std::is_same_v) + && std::is_same_v) + { + /// Conversion of DateTime to Date: throw off time component. + /// Conversion of Date32 to Date. + return DateTimeTransformImpl, false>::execute( + arguments, result_type, input_rows_count); + } + else if constexpr (std::is_same_v && std::is_same_v) + { + /// Conversion of DateTime to Date: throw off time component. + return DateTimeTransformImpl::execute( + arguments, result_type, input_rows_count); + } + else if constexpr ((std::is_same_v || std::is_same_v) + && std::is_same_v) + { + /// Conversion from Date/Date32 to DateTime. + return DateTimeTransformImpl, false>::execute( + arguments, result_type, input_rows_count); + } else { using ColVecFrom = typename FromDataType::ColumnType; @@ -404,88 +489,6 @@ struct ConvertImpl } }; -/** Conversion of DateTime to Date: throw off time component. - */ -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -/** Conversion of DateTime to Date32: throw off time component. - */ -template -struct ConvertImpl - : DateTimeTransformImpl {}; - -/** Conversion of Date to DateTime: adding 00:00:00 time component. - */ -template -struct ToDateTimeImpl -{ - static constexpr auto name = "toDateTime"; - - static UInt32 execute(UInt16 d, const DateLUTImpl & time_zone) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (d > MAX_DATETIME_DAY_NUM) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Day number {} is out of bounds of type DateTime", d); - } - else if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) - { - if (d > MAX_DATETIME_DAY_NUM) - d = MAX_DATETIME_DAY_NUM; - } - return static_cast(time_zone.fromDayNum(DayNum(d))); - } - - static UInt32 execute(Int32 d, const DateLUTImpl & time_zone) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) - { - if (d < 0) - return 0; - else if (d > MAX_DATETIME_DAY_NUM) - d = MAX_DATETIME_DAY_NUM; - } - else if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Throw) - { - if (d < 0 || d > MAX_DATETIME_DAY_NUM) [[unlikely]] - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type DateTime", d); - } - return static_cast(time_zone.fromDayNum(ExtendedDayNum(d))); - } - - static UInt32 execute(UInt32 dt, const DateLUTImpl & /*time_zone*/) - { - return dt; - } - - static UInt32 execute(Int64 dt64, const DateLUTImpl & /*time_zone*/) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Ignore) - return static_cast(dt64); - else - { - if (dt64 < 0 || dt64 >= MAX_DATETIME_TIMESTAMP) - { - if constexpr (date_time_overflow_behavior == FormatSettings::DateTimeOverflowBehavior::Saturate) - return dt64 < 0 ? 0 : std::numeric_limits::max(); - else - throw Exception(ErrorCodes::VALUE_IS_OUT_OF_RANGE_OF_DATA_TYPE, "Value {} is out of bounds of type DateTime", dt64); - } - else - return static_cast(dt64); - } - } -}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; - -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; /// Implementation of toDate function. @@ -510,11 +513,6 @@ struct ToDateTransform32Or64 } }; -/** Conversion of Date32 to Date. 
- */ -template -struct ConvertImpl - : DateTimeTransformImpl, false> {}; template struct ToDateTransform32Or64Signed From b07e5d9f5a6420a9ba3d19d63070831a9db121ea Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 08:11:59 +0300 Subject: [PATCH 0191/1081] Update StorageMergeTree.cpp --- src/Storages/StorageMergeTree.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 0748ac2dbdf..055a48ad998 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -2083,7 +2083,7 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level); IDataPartStorage::ClonePartParams clone_params{.txn = local_context->getCurrentTransaction()}; - LOG_TRACE(log, "Partition exps are the same:part id: {}; number of disks:{}",dst_part_info.partition_id, this->getStoragePolicy()->getDisks().size()); + bool on_same_disk = false; for (const DiskPtr & disk : this->getStoragePolicy()->getDisks()) if (disk->getName() == src_part->getDataPartStorage().getDiskName()) From 24a248859d97f7faf78a1dc98a056332cde97401 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 06:52:47 +0100 Subject: [PATCH 0192/1081] Remove unneeded file --- src/Compression/examples/CMakeLists.txt | 3 - .../cached_compressed_read_buffer.cpp | 79 ------------------- .../examples/compressed_buffer.cpp | 3 - 3 files changed, 85 deletions(-) delete mode 100644 src/Compression/examples/cached_compressed_read_buffer.cpp diff --git a/src/Compression/examples/CMakeLists.txt b/src/Compression/examples/CMakeLists.txt index 7bf68e8845e..86fa5c3f78d 100644 --- a/src/Compression/examples/CMakeLists.txt +++ b/src/Compression/examples/CMakeLists.txt @@ -1,5 +1,2 @@ clickhouse_add_executable (compressed_buffer compressed_buffer.cpp) target_link_libraries (compressed_buffer PRIVATE dbms) - -clickhouse_add_executable (cached_compressed_read_buffer cached_compressed_read_buffer.cpp) -target_link_libraries (cached_compressed_read_buffer PRIVATE dbms) diff --git a/src/Compression/examples/cached_compressed_read_buffer.cpp b/src/Compression/examples/cached_compressed_read_buffer.cpp deleted file mode 100644 index a8e14ac7271..00000000000 --- a/src/Compression/examples/cached_compressed_read_buffer.cpp +++ /dev/null @@ -1,79 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include -#include - -#include - - -int main(int argc, char ** argv) -{ - using namespace DB; - - if (argc < 2) - { - std::cerr << "Usage: program path\n"; - return 1; - } - - try - { - UncompressedCache cache("SLRU", 1024, 0.5); - std::string path = argv[1]; - - std::cerr << std::fixed << std::setprecision(3); - - size_t hits = 0; - size_t misses = 0; - - { - Stopwatch watch; - CachedCompressedReadBuffer in( - path, - [&]() - { - return createReadBufferFromFileBase(path, {}); - }, - &cache - ); - WriteBufferFromFile out("/dev/null"); - copyData(in, out); - - std::cerr << "Elapsed: " << watch.elapsedSeconds() << std::endl; - } - - cache.getStats(hits, misses); - std::cerr << "Hits: " << hits << ", misses: " << misses << std::endl; - - { - Stopwatch watch; - CachedCompressedReadBuffer in( - path, - [&]() - { - return createReadBufferFromFileBase(path, {}); - }, - &cache - ); - WriteBufferFromFile out("/dev/null"); - copyData(in, out); - - std::cerr << "Elapsed: " << watch.elapsedSeconds() << std::endl; - } - - 
cache.getStats(hits, misses); - std::cerr << "Hits: " << hits << ", misses: " << misses << std::endl; - } - catch (const Exception & e) - { - std::cerr << e.what() << ", " << e.displayText() << std::endl; - return 1; - } - - return 0; -} diff --git a/src/Compression/examples/compressed_buffer.cpp b/src/Compression/examples/compressed_buffer.cpp index 74646ff0f28..530f0938662 100644 --- a/src/Compression/examples/compressed_buffer.cpp +++ b/src/Compression/examples/compressed_buffer.cpp @@ -1,7 +1,4 @@ -#include - #include -#include #include #include From a231f7de9ae7b86c41204c774dda40c3fbe73ac6 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 06:55:09 +0100 Subject: [PATCH 0193/1081] Remove unneeded file --- src/Compression/examples/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Compression/examples/CMakeLists.txt b/src/Compression/examples/CMakeLists.txt index 86fa5c3f78d..a924075d0dc 100644 --- a/src/Compression/examples/CMakeLists.txt +++ b/src/Compression/examples/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable (compressed_buffer compressed_buffer.cpp) -target_link_libraries (compressed_buffer PRIVATE dbms) +target_link_libraries (compressed_buffer PRIVATE clickhouse_common_io) From 69cab686e4e418c0e7cfa74dfea459671d14bc49 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 10:03:31 +0100 Subject: [PATCH 0194/1081] Fix build of examples --- src/Compression/examples/CMakeLists.txt | 2 +- src/Core/examples/CMakeLists.txt | 3 - src/Core/examples/mysql_protocol.cpp | 390 ------------------ src/Interpreters/examples/CMakeLists.txt | 22 +- src/Interpreters/examples/hash_map.cpp | 1 - src/Interpreters/examples/hash_map_lookup.cpp | 2 - src/Processors/CMakeLists.txt | 4 - src/Processors/examples/CMakeLists.txt | 4 - .../examples/comma_separated_streams.cpp | 117 ------ src/Processors/examples/test_in | 8 - src/Storages/MergeTree/CMakeLists.txt | 3 - .../MergeTree/examples/CMakeLists.txt | 2 - .../examples/wal_action_metadata.cpp | 61 --- src/Storages/examples/CMakeLists.txt | 8 - src/Storages/examples/active_parts.py | 41 -- .../examples/async_read_buffer_from_hdfs.cpp | 37 -- ...get_abandonable_lock_in_all_partitions.cpp | 71 ---- 17 files changed, 12 insertions(+), 764 deletions(-) delete mode 100644 src/Core/examples/mysql_protocol.cpp delete mode 100644 src/Processors/examples/CMakeLists.txt delete mode 100644 src/Processors/examples/comma_separated_streams.cpp delete mode 100644 src/Processors/examples/test_in delete mode 100644 src/Storages/MergeTree/examples/CMakeLists.txt delete mode 100644 src/Storages/MergeTree/examples/wal_action_metadata.cpp delete mode 100644 src/Storages/examples/active_parts.py delete mode 100644 src/Storages/examples/async_read_buffer_from_hdfs.cpp delete mode 100644 src/Storages/examples/get_abandonable_lock_in_all_partitions.cpp diff --git a/src/Compression/examples/CMakeLists.txt b/src/Compression/examples/CMakeLists.txt index a924075d0dc..a7cc6bebf42 100644 --- a/src/Compression/examples/CMakeLists.txt +++ b/src/Compression/examples/CMakeLists.txt @@ -1,2 +1,2 @@ clickhouse_add_executable (compressed_buffer compressed_buffer.cpp) -target_link_libraries (compressed_buffer PRIVATE clickhouse_common_io) +target_link_libraries (compressed_buffer PRIVATE clickhouse_common_io clickhouse_compression) diff --git a/src/Core/examples/CMakeLists.txt b/src/Core/examples/CMakeLists.txt index 2326eada96d..f30ee25491f 100644 --- a/src/Core/examples/CMakeLists.txt +++ 
b/src/Core/examples/CMakeLists.txt @@ -6,6 +6,3 @@ target_link_libraries (field PRIVATE dbms) clickhouse_add_executable (string_ref_hash string_ref_hash.cpp) target_link_libraries (string_ref_hash PRIVATE clickhouse_common_io) - -clickhouse_add_executable (mysql_protocol mysql_protocol.cpp) -target_link_libraries (mysql_protocol PRIVATE dbms) diff --git a/src/Core/examples/mysql_protocol.cpp b/src/Core/examples/mysql_protocol.cpp deleted file mode 100644 index a6247418e87..00000000000 --- a/src/Core/examples/mysql_protocol.cpp +++ /dev/null @@ -1,390 +0,0 @@ -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -int main(int argc, char ** argv) -{ - using namespace DB; - using namespace MySQLProtocol; - using namespace MySQLProtocol::Generic; - using namespace MySQLProtocol::Authentication; - using namespace MySQLProtocol::ConnectionPhase; - using namespace MySQLProtocol::ProtocolText; - - - uint8_t server_sequence_id = 1; - uint8_t client_sequence_id = 1; - String user = "default"; - String password = "123"; - String database; - - UInt8 charset_utf8 = 33; - UInt32 max_packet_size = MAX_PACKET_LENGTH; - String mysql_native_password = "mysql_native_password"; - - UInt32 server_capability_flags = CLIENT_PROTOCOL_41 | CLIENT_SECURE_CONNECTION | CLIENT_PLUGIN_AUTH - | CLIENT_PLUGIN_AUTH_LENENC_CLIENT_DATA | CLIENT_CONNECT_WITH_DB | CLIENT_DEPRECATE_EOF; - - UInt32 client_capability_flags = CLIENT_PROTOCOL_41 | CLIENT_PLUGIN_AUTH | CLIENT_SECURE_CONNECTION; - - /// Handshake packet - { - /// 1. Greeting: - /// 1.1 Server writes greeting to client - std::string s0; - WriteBufferFromString out0(s0); - - Handshake server_handshake( - server_capability_flags, -1, "ClickHouse", "mysql_native_password", "aaaaaaaaaaaaaaaaaaaaa", CharacterSet::utf8_general_ci); - server_handshake.writePayload(out0, server_sequence_id); - - /// 1.2 Client reads the greeting - ReadBufferFromString in0(s0); - Handshake client_handshake; - client_handshake.readPayload(in0, client_sequence_id); - - /// Check packet - ASSERT(server_handshake.capability_flags == client_handshake.capability_flags) - ASSERT(server_handshake.status_flags == client_handshake.status_flags) - ASSERT(server_handshake.server_version == client_handshake.server_version) - ASSERT(server_handshake.protocol_version == client_handshake.protocol_version) - ASSERT(server_handshake.auth_plugin_data.substr(0, 20) == client_handshake.auth_plugin_data) - ASSERT(server_handshake.auth_plugin_name == client_handshake.auth_plugin_name) - - /// 2. 
Greeting Response: - std::string s1; - WriteBufferFromString out1(s1); - - /// 2.1 Client writes to server - Native41 native41(password, client_handshake.auth_plugin_data); - String auth_plugin_data = native41.getAuthPluginData(); - HandshakeResponse client_handshake_response( - client_capability_flags, max_packet_size, charset_utf8, user, database, auth_plugin_data, mysql_native_password); - client_handshake_response.writePayload(out1, client_sequence_id); - - /// 2.2 Server reads the response - ReadBufferFromString in1(s1); - HandshakeResponse server_handshake_response; - server_handshake_response.readPayload(in1, server_sequence_id); - - /// Check - ASSERT(server_handshake_response.capability_flags == client_handshake_response.capability_flags) - ASSERT(server_handshake_response.character_set == client_handshake_response.character_set) - ASSERT(server_handshake_response.username == client_handshake_response.username) - ASSERT(server_handshake_response.database == client_handshake_response.database) - ASSERT(server_handshake_response.auth_response == client_handshake_response.auth_response) - ASSERT(server_handshake_response.auth_plugin_name == client_handshake_response.auth_plugin_name) - } - - /// OK Packet - { - // 1. Server writes packet - std::string s0; - WriteBufferFromString out0(s0); - OKPacket server(0x00, server_capability_flags, 0, 0, 0, "", ""); - server.writePayload(out0, server_sequence_id); - - // 2. Client reads packet - ReadBufferFromString in0(s0); - ResponsePacket client(server_capability_flags); - client.readPayload(in0, client_sequence_id); - - // Check - ASSERT(client.getType() == PACKET_OK) - ASSERT(client.ok.header == server.header) - ASSERT(client.ok.status_flags == server.status_flags) - ASSERT(client.ok.capabilities == server.capabilities) - } - - /// ERR Packet - { - // 1. Server writes packet - std::string s0; - WriteBufferFromString out0(s0); - ERRPacket server(123, "12345", "This is the error message"); - server.writePayload(out0, server_sequence_id); - - // 2. Client reads packet - ReadBufferFromString in0(s0); - ResponsePacket client(server_capability_flags); - client.readPayload(in0, client_sequence_id); - - // Check - ASSERT(client.getType() == PACKET_ERR) - ASSERT(client.err.header == server.header) - ASSERT(client.err.error_code == server.error_code) - ASSERT(client.err.sql_state == server.sql_state) - ASSERT(client.err.error_message == server.error_message) - } - - /// EOF Packet - { - // 1. Server writes packet - std::string s0; - WriteBufferFromString out0(s0); - EOFPacket server(1, 1); - server.writePayload(out0, server_sequence_id); - - // 2. Client reads packet - ReadBufferFromString in0(s0); - ResponsePacket client(server_capability_flags); - client.readPayload(in0, client_sequence_id); - - // Check - ASSERT(client.getType() == PACKET_EOF) - ASSERT(client.eof.header == server.header) - ASSERT(client.eof.warnings == server.warnings) - ASSERT(client.eof.status_flags == server.status_flags) - } - - /// ColumnDefinition Packet - { - // 1. Server writes packet - std::string s0; - WriteBufferFromString out0(s0); - ColumnDefinition server("schema", "tbl", "org_tbl", "name", "org_name", 33, 0x00, MYSQL_TYPE_STRING, 0x00, 0x00); - server.writePayload(out0, server_sequence_id); - - // 2. 
Client reads packet - ReadBufferFromString in0(s0); - ColumnDefinition client; - client.readPayload(in0, client_sequence_id); - - // Check - ASSERT(client.column_type == server.column_type) - ASSERT(client.column_length == server.column_length) - ASSERT(client.next_length == server.next_length) - ASSERT(client.character_set == server.character_set) - ASSERT(client.decimals == server.decimals) - ASSERT(client.name == server.name) - ASSERT(client.org_name == server.org_name) - ASSERT(client.table == server.table) - ASSERT(client.org_table == server.org_table) - ASSERT(client.schema == server.schema) - } - - /// GTID sets tests. - { - struct Testcase - { - String name; - String sets; - String want; - }; - - Testcase cases[] = { - {"gtid-sets-without-whitespace", - "2c5adab4-d64a-11e5-82df-ac162d72dac0:1-247743812,9f58c169-d121-11e7-835b-ac162db9c048:1-56060985:56060987-56061175:56061177-" - "56061224:56061226-75201528:75201530-75201755:75201757-75201983:75201985-75407550:75407552-75407604:75407606-75407661:" - "75407663-87889848:87889850-87889935:87889937-87890042:87890044-88391955:88391957-88392125:88392127-88392245:88392247-" - "88755771:88755773-88755826:88755828-88755921:88755923-100279047:100279049-100279126:100279128-100279247:100279249-121672430:" - "121672432-121672503:121672505-121672524:121672526-122946019:122946021-122946291:122946293-122946469:122946471-134313284:" - "134313286-134313415:134313417-134313648:134313650-136492728:136492730-136492784:136492786-136492904:136492906-145582402:" - "145582404-145582439:145582441-145582463:145582465-147455222:147455224-147455262:147455264-147455277:147455279-149319049:" - "149319051-149319261:149319263-150635915,a6d83ff6-bfcf-11e7-8c93-246e96158550:1-126618302", - "2c5adab4-d64a-11e5-82df-ac162d72dac0:1-247743812,9f58c169-d121-11e7-835b-ac162db9c048:1-56060985:56060987-56061175:56061177-" - "56061224:56061226-75201528:75201530-75201755:75201757-75201983:75201985-75407550:75407552-75407604:75407606-75407661:" - "75407663-87889848:87889850-87889935:87889937-87890042:87890044-88391955:88391957-88392125:88392127-88392245:88392247-" - "88755771:88755773-88755826:88755828-88755921:88755923-100279047:100279049-100279126:100279128-100279247:100279249-121672430:" - "121672432-121672503:121672505-121672524:121672526-122946019:122946021-122946291:122946293-122946469:122946471-134313284:" - "134313286-134313415:134313417-134313648:134313650-136492728:136492730-136492784:136492786-136492904:136492906-145582402:" - "145582404-145582439:145582441-145582463:145582465-147455222:147455224-147455262:147455264-147455277:147455279-149319049:" - "149319051-149319261:149319263-150635915,a6d83ff6-bfcf-11e7-8c93-246e96158550:1-126618302"}, - - {"gtid-sets-with-whitespace", - "2c5adab4-d64a-11e5-82df-ac162d72dac0:1-247743812, 9f58c169-d121-11e7-835b-ac162db9c048:1-56060985:56060987-56061175:56061177", - "2c5adab4-d64a-11e5-82df-ac162d72dac0:1-247743812,9f58c169-d121-11e7-835b-ac162db9c048:1-56060985:56060987-56061175:56061177"}, - - {"gtid-sets-single", "2c5adab4-d64a-11e5-82df-ac162d72dac0:1-247743812", "2c5adab4-d64a-11e5-82df-ac162d72dac0:1-247743812"}}; - - for (auto & tc : cases) - { - GTIDSets gtid_sets; - gtid_sets.parse(tc.sets); - - String want = tc.want; - String got = gtid_sets.toString(); - ASSERT(want == got) - } - } - - { - struct Testcase - { - String name; - String gtid_sets; - String gtid_str; - String want; - }; - - Testcase cases[] = { - {"merge", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-2:4-7", - "10662d71-9d91-11ea-bbc2-0242ac110003:3", - 
"10662d71-9d91-11ea-bbc2-0242ac110003:1-7"}, - - {"merge-front", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-2:5-7", - "10662d71-9d91-11ea-bbc2-0242ac110003:3", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-3:5-7"}, - - {"extend-interval", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-2:6-7", - "10662d71-9d91-11ea-bbc2-0242ac110003:4", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-2:4:6-7"}, - - {"extend-interval", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-2:4:7-9", - "10662d71-9d91-11ea-bbc2-0242ac110003:5", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-2:4-5:7-9"}, - - {"extend-interval", - "10662d71-9d91-11ea-bbc2-0242ac110003:6-7", - "10662d71-9d91-11ea-bbc2-0242ac110003:4", - "10662d71-9d91-11ea-bbc2-0242ac110003:4:6-7"}, - - {"extend-interval", - "10662d71-9d91-11ea-bbc2-0242ac110003:6-7", - "10662d71-9d91-11ea-bbc2-0242ac110003:9", - "10662d71-9d91-11ea-bbc2-0242ac110003:6-7:9"}, - - {"extend-interval", - "10662d71-9d91-11ea-bbc2-0242ac110003:6-7", - "20662d71-9d91-11ea-bbc2-0242ac110003:9", - "10662d71-9d91-11ea-bbc2-0242ac110003:6-7,20662d71-9d91-11ea-bbc2-0242ac110003:9"}, - - {"shrink-sequence", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-3:4-5:7", - "10662d71-9d91-11ea-bbc2-0242ac110003:6", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-7"}, - - {"shrink-sequence", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-3:4-5:10", - "10662d71-9d91-11ea-bbc2-0242ac110003:8", - "10662d71-9d91-11ea-bbc2-0242ac110003:1-5:8:10" - } - }; - - for (auto & tc : cases) - { - GTIDSets gtid_sets; - gtid_sets.parse(tc.gtid_sets); - ASSERT(tc.gtid_sets == gtid_sets.toString()) - - GTIDSets gtid_sets1; - gtid_sets1.parse(tc.gtid_str); - - GTID gtid; - gtid.uuid = gtid_sets1.sets[0].uuid; - gtid.seq_no = gtid_sets1.sets[0].intervals[0].start; - gtid_sets.update(gtid); - - String want = tc.want; - String got = gtid_sets.toString(); - ASSERT(want == got) - } - } - - { - /// mysql_protocol --host=172.17.0.3 --user=root --password=123 --db=sbtest - try - { - boost::program_options::options_description desc("Allowed options"); - desc.add_options()("host", boost::program_options::value()->required(), "master host")( - "port", boost::program_options::value()->default_value(3306), "master port")( - "user", boost::program_options::value()->default_value("root"), "master user")( - "password", boost::program_options::value()->required(), "master password")( - "gtid", boost::program_options::value()->default_value(""), "executed GTID sets")( - "db", boost::program_options::value()->required(), "replicate do db")( - "binlog_checksum", boost::program_options::value()->default_value("CRC32"), "master binlog_checksum"); - - boost::program_options::variables_map options; - boost::program_options::store(boost::program_options::parse_command_line(argc, argv, desc), options); - if (argc == 0) - { - return 1; - } - - auto host = options.at("host").as(); - auto port = options.at("port").as(); - auto master_user = options.at("user").as(); - auto master_password = options.at("password").as(); - auto gtid_sets = options.at("gtid").as(); - auto replicate_db = options.at("db").as(); - auto binlog_checksum = options.at("binlog_checksum").as(); - - std::cerr << "Master Host: " << host << ", Port: " << port << ", User: " << master_user << ", Password: " << master_password - << ", Replicate DB: " << replicate_db << ", GTID: " << gtid_sets << std::endl; - - UInt32 slave_id = 9004; - MySQLClient slave(host, port, master_user, master_password); - - /// Connect to the master. 
- slave.connect(); - slave.startBinlogDumpGTID(slave_id, replicate_db, {}, gtid_sets, binlog_checksum); - - WriteBufferFromOStream cerr(std::cerr); - - /// Read one binlog event on by one. - while (true) - { - auto event = slave.readOneBinlogEvent(); - switch (event->type()) - { - case MYSQL_QUERY_EVENT: { - auto binlog_event = std::static_pointer_cast(event); - binlog_event->dump(cerr); - - Position pos = slave.getPosition(); - pos.dump(cerr); - break; - } - case MYSQL_WRITE_ROWS_EVENT: { - auto binlog_event = std::static_pointer_cast(event); - binlog_event->dump(cerr); - - Position pos = slave.getPosition(); - pos.dump(cerr); - break; - } - case MYSQL_UPDATE_ROWS_EVENT: { - auto binlog_event = std::static_pointer_cast(event); - binlog_event->dump(cerr); - - Position pos = slave.getPosition(); - pos.dump(cerr); - break; - } - case MYSQL_DELETE_ROWS_EVENT: { - auto binlog_event = std::static_pointer_cast(event); - binlog_event->dump(cerr); - - Position pos = slave.getPosition(); - pos.dump(cerr); - break; - } - default: - if (event->header.type != MySQLReplication::EventType::HEARTBEAT_EVENT) - { - event->dump(cerr); - } - break; - } - } - } - catch (const Exception & ex) - { - std::cerr << "Error: " << ex.message() << std::endl; - return 1; - } - } -} diff --git a/src/Interpreters/examples/CMakeLists.txt b/src/Interpreters/examples/CMakeLists.txt index 11c219ff64e..8bb7f9eeb98 100644 --- a/src/Interpreters/examples/CMakeLists.txt +++ b/src/Interpreters/examples/CMakeLists.txt @@ -1,35 +1,35 @@ clickhouse_add_executable (hash_map hash_map.cpp) -target_link_libraries (hash_map PRIVATE dbms ch_contrib::sparsehash) +target_link_libraries (hash_map PRIVATE dbms clickhouse_functions ch_contrib::sparsehash) clickhouse_add_executable (hash_map_lookup hash_map_lookup.cpp) -target_link_libraries (hash_map_lookup PRIVATE dbms) +target_link_libraries (hash_map_lookup PRIVATE clickhouse_common_io clickhouse_compression) clickhouse_add_executable (hash_map3 hash_map3.cpp) -target_link_libraries (hash_map3 PRIVATE dbms ch_contrib::farmhash ch_contrib::metrohash) +target_link_libraries (hash_map3 PRIVATE clickhouse_common_io clickhouse_compression ch_contrib::farmhash ch_contrib::metrohash) clickhouse_add_executable (hash_map_string hash_map_string.cpp) -target_link_libraries (hash_map_string PRIVATE dbms ch_contrib::sparsehash) +target_link_libraries (hash_map_string PRIVATE clickhouse_common_io clickhouse_compression ch_contrib::sparsehash) clickhouse_add_executable (hash_map_string_2 hash_map_string_2.cpp) -target_link_libraries (hash_map_string_2 PRIVATE dbms) +target_link_libraries (hash_map_string_2 PRIVATE clickhouse_common_io clickhouse_compression) clickhouse_add_executable (hash_map_string_3 hash_map_string_3.cpp) -target_link_libraries (hash_map_string_3 PRIVATE dbms ch_contrib::farmhash ch_contrib::metrohash) +target_link_libraries (hash_map_string_3 PRIVATE clickhouse_common_io clickhouse_compression ch_contrib::farmhash ch_contrib::metrohash) clickhouse_add_executable (hash_map_string_small hash_map_string_small.cpp) -target_link_libraries (hash_map_string_small PRIVATE dbms ch_contrib::sparsehash) +target_link_libraries (hash_map_string_small PRIVATE clickhouse_common_io clickhouse_compression ch_contrib::sparsehash) clickhouse_add_executable (string_hash_map string_hash_map.cpp) -target_link_libraries (string_hash_map PRIVATE dbms ch_contrib::sparsehash) +target_link_libraries (string_hash_map PRIVATE clickhouse_common_io clickhouse_compression ch_contrib::sparsehash) 
clickhouse_add_executable (string_hash_map_aggregation string_hash_map.cpp) -target_link_libraries (string_hash_map_aggregation PRIVATE dbms) +target_link_libraries (string_hash_map_aggregation PRIVATE clickhouse_common_io clickhouse_compression) clickhouse_add_executable (string_hash_set string_hash_set.cpp) -target_link_libraries (string_hash_set PRIVATE dbms) +target_link_libraries (string_hash_set PRIVATE clickhouse_common_io clickhouse_compression) clickhouse_add_executable (two_level_hash_map two_level_hash_map.cpp) -target_link_libraries (two_level_hash_map PRIVATE dbms ch_contrib::sparsehash) +target_link_libraries (two_level_hash_map PRIVATE clickhouse_common_io clickhouse_compression ch_contrib::sparsehash) clickhouse_add_executable (jit_example jit_example.cpp) target_link_libraries (jit_example PRIVATE dbms) diff --git a/src/Interpreters/examples/hash_map.cpp b/src/Interpreters/examples/hash_map.cpp index b55f174678e..0a91d00809f 100644 --- a/src/Interpreters/examples/hash_map.cpp +++ b/src/Interpreters/examples/hash_map.cpp @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Interpreters/examples/hash_map_lookup.cpp b/src/Interpreters/examples/hash_map_lookup.cpp index fd6b231cf73..829a234c537 100644 --- a/src/Interpreters/examples/hash_map_lookup.cpp +++ b/src/Interpreters/examples/hash_map_lookup.cpp @@ -8,10 +8,8 @@ #define DBMS_HASH_MAP_DEBUG_RESIZES #include -#include #include #include -#include #include #include diff --git a/src/Processors/CMakeLists.txt b/src/Processors/CMakeLists.txt index 7e965188b4c..e69de29bb2d 100644 --- a/src/Processors/CMakeLists.txt +++ b/src/Processors/CMakeLists.txt @@ -1,4 +0,0 @@ -if (ENABLE_EXAMPLES) - add_subdirectory(examples) -endif () - diff --git a/src/Processors/examples/CMakeLists.txt b/src/Processors/examples/CMakeLists.txt deleted file mode 100644 index 5d43a0d7d08..00000000000 --- a/src/Processors/examples/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -if (TARGET ch_contrib::hivemetastore) - clickhouse_add_executable (comma_separated_streams comma_separated_streams.cpp) - target_link_libraries (comma_separated_streams PRIVATE dbms) -endif() diff --git a/src/Processors/examples/comma_separated_streams.cpp b/src/Processors/examples/comma_separated_streams.cpp deleted file mode 100644 index 2ec5564f346..00000000000 --- a/src/Processors/examples/comma_separated_streams.cpp +++ /dev/null @@ -1,117 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using namespace DB; - -int main() -try -{ - Block sample; - { - // a - ColumnWithTypeAndName col; - col.name = "a"; - col.type = std::make_shared(); - sample.insert(std::move(col)); - } - { - // b - ColumnWithTypeAndName col; - col.name = "b"; - col.type = std::make_shared(); - sample.insert(std::move(col)); - } - { - // c - ColumnWithTypeAndName col; - col.name = "c"; - col.type = std::make_shared(); - sample.insert(std::move(col)); - } - { - // d - ColumnWithTypeAndName col; - col.name = "d"; - col.type = std::make_shared(); - sample.insert(std::move(col)); - } - { - // e - ColumnWithTypeAndName col; - col.name = "e"; - col.type = std::make_shared(); - sample.insert(std::move(col)); - } - { - // f - ColumnWithTypeAndName col; - col.name = "f"; - col.type = std::make_shared(); - sample.insert(std::move(col)); - } - { - // g - ColumnWithTypeAndName col; - col.name = "g"; - col.type = std::make_shared(); - sample.insert(std::move(col)); - } - { - // h - 
ColumnWithTypeAndName col; - col.name = "h"; - col.type = std::make_shared(); - sample.insert(std::move(col)); - } - - - ReadBufferFromFile in_buf("test_in"); - WriteBufferFromFile out_buf("test_out"); - - FormatSettings format_settings; - format_settings.with_names_use_header = true; - format_settings.skip_unknown_fields = true; - format_settings.csv.delimiter = '\x01'; - format_settings.hive_text.input_field_names = - { - "d", - "e", - "f", - "a", - "b", - "c", - "g", - "h", - "i", - "j", - }; - - RowInputFormatParams in_params{DEFAULT_INSERT_BLOCK_SIZE}; - InputFormatPtr input_format = std::make_shared(sample, in_buf, in_params, format_settings); - auto pipeline = QueryPipeline(std::move(input_format)); - auto reader = std::make_unique(pipeline); - - OutputFormatPtr output_format = std::make_shared(out_buf, sample, true, true, format_settings); - Block res; - while (reader->pull(res)) - { - output_format->write(res); - } - return 0; -} -catch (...) -{ - std::cerr << getCurrentExceptionMessage(true) << '\n'; - return 1; -} diff --git a/src/Processors/examples/test_in b/src/Processors/examples/test_in deleted file mode 100644 index c7df97a26a6..00000000000 --- a/src/Processors/examples/test_in +++ /dev/null @@ -1,8 +0,0 @@ -2021-09-14JPall20.0200 -2021-09-14CIall20.0100 -2021-09-14JMall40.25411 -2021-09-14MMall310.19354838709677422766 -2021-09-14TZAndroid30.3333333333333333311 -2021-09-14SGall80.25412 -2021-09-14PYall11.0001 -2021-09-14MXall10.0100 diff --git a/src/Storages/MergeTree/CMakeLists.txt b/src/Storages/MergeTree/CMakeLists.txt index 390835f17ae..e69de29bb2d 100644 --- a/src/Storages/MergeTree/CMakeLists.txt +++ b/src/Storages/MergeTree/CMakeLists.txt @@ -1,3 +0,0 @@ -if(ENABLE_EXAMPLES) - add_subdirectory(examples) -endif() diff --git a/src/Storages/MergeTree/examples/CMakeLists.txt b/src/Storages/MergeTree/examples/CMakeLists.txt deleted file mode 100644 index 25bba7ae0b4..00000000000 --- a/src/Storages/MergeTree/examples/CMakeLists.txt +++ /dev/null @@ -1,2 +0,0 @@ -clickhouse_add_executable (wal_action_metadata wal_action_metadata.cpp) -target_link_libraries (wal_action_metadata PRIVATE dbms) diff --git a/src/Storages/MergeTree/examples/wal_action_metadata.cpp b/src/Storages/MergeTree/examples/wal_action_metadata.cpp deleted file mode 100644 index 03c38c7a186..00000000000 --- a/src/Storages/MergeTree/examples/wal_action_metadata.cpp +++ /dev/null @@ -1,61 +0,0 @@ -#include - -#include -#include - -namespace DB -{ -namespace ErrorCodes -{ - extern const int UNKNOWN_FORMAT_VERSION; -} -} - -int main(int, char **) -{ - try - { - { - std::cout << "test: dummy test" << std::endl; - - DB::MergeTreeWriteAheadLog::ActionMetadata metadata_out; - DB::MemoryWriteBuffer buf{}; - - metadata_out.write(buf); - buf.finalize(); - - metadata_out.read(*buf.tryGetReadBuffer()); - } - - { - std::cout << "test: min compatibility" << std::endl; - - DB::MergeTreeWriteAheadLog::ActionMetadata metadata_out; - metadata_out.min_compatible_version = DB::MergeTreeWriteAheadLog::WAL_VERSION + 1; - DB::MemoryWriteBuffer buf{}; - - metadata_out.write(buf); - buf.finalize(); - - try - { - metadata_out.read(*buf.tryGetReadBuffer()); - } - catch (const DB::Exception & e) - { - if (e.code() != DB::ErrorCodes::UNKNOWN_FORMAT_VERSION) - { - std::cerr << "Expected UNKNOWN_FORMAT_VERSION exception but got: " - << e.what() << ", " << e.displayText() << std::endl; - } - } - } - } - catch (const DB::Exception & e) - { - std::cerr << e.what() << ", " << e.displayText() << std::endl; - return 1; - } - - return 0; -} 
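The wal_action_metadata example removed above exercised one contract worth noting: a reader must reject metadata whose min_compatible_version exceeds the WAL version it implements. A self-contained sketch of that handshake, with simplified one-byte serialization in place of ClickHouse's buffer classes:

    #include <cstdint>
    #include <istream>
    #include <ostream>
    #include <sstream>
    #include <stdexcept>

    constexpr uint8_t WAL_VERSION = 1;

    struct ActionMetadata
    {
        uint8_t min_compatible_version = 0;

        void write(std::ostream & out) const
        {
            out.put(static_cast<char>(min_compatible_version));
        }

        void read(std::istream & in)
        {
            min_compatible_version = static_cast<uint8_t>(in.get());
            /// Refuse data written for a future, incompatible format.
            if (min_compatible_version > WAL_VERSION)
                throw std::runtime_error("unknown format version");
        }
    };

    int main()
    {
        ActionMetadata meta;
        meta.min_compatible_version = WAL_VERSION + 1; /// as in the removed test

        std::stringstream buf;
        meta.write(buf);

        ActionMetadata reader;
        try { reader.read(buf); }
        catch (const std::runtime_error &) { /* expected: version too new */ }
    }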
diff --git a/src/Storages/examples/CMakeLists.txt b/src/Storages/examples/CMakeLists.txt index 225337d8ec8..cddfc9404d4 100644 --- a/src/Storages/examples/CMakeLists.txt +++ b/src/Storages/examples/CMakeLists.txt @@ -6,11 +6,3 @@ target_link_libraries (merge_selector2 PRIVATE dbms) clickhouse_add_executable (get_current_inserts_in_replicated get_current_inserts_in_replicated.cpp) target_link_libraries (get_current_inserts_in_replicated PRIVATE dbms clickhouse_common_config clickhouse_common_zookeeper string_utils) - -clickhouse_add_executable (get_abandonable_lock_in_all_partitions get_abandonable_lock_in_all_partitions.cpp) -target_link_libraries (get_abandonable_lock_in_all_partitions PRIVATE dbms clickhouse_common_config clickhouse_common_zookeeper) - -if (TARGET ch_contrib::hdfs) - clickhouse_add_executable (async_read_buffer_from_hdfs async_read_buffer_from_hdfs.cpp) - target_link_libraries (async_read_buffer_from_hdfs PRIVATE dbms ch_contrib::hdfs) -endif () diff --git a/src/Storages/examples/active_parts.py b/src/Storages/examples/active_parts.py deleted file mode 100644 index d82c5ca96bf..00000000000 --- a/src/Storages/examples/active_parts.py +++ /dev/null @@ -1,41 +0,0 @@ -#!/usr/bin/python -# coding=UTF-8 - -# Displays a list of active parts - parts that are not overlapped by any other part. -# Usage: `ls /var/lib/clickhouse/data/merge/visits | active_parts.py` - -import sys -import re - -parts = {} -for s in sys.stdin.read().split(): - m = re.match( - "^([0-9]{6})[0-9]{2}_([0-9]{6})[0-9]{2}_([0-9]+)_([0-9]+)_([0-9]+)$", s - ) - if m == None: - continue - m1 = m.group(1) - m2 = m.group(2) - i1 = int(m.group(3)) - i2 = int(m.group(4)) - l = int(m.group(5)) - if m1 != m2: - raise Exception("not in single month: " + s) - if m1 not in parts: - parts[m1] = [] - parts[m1].append((i1, i2, l, s)) - -for m, ps in sorted(parts.items()): - ps.sort(key=lambda i1_i2_l_s: (i1_i2_l_s[0], -i1_i2_l_s[1], -i1_i2_l_s[2])) - (x2, y2, l2, s2) = (-1, -1, -1, -1) - for x1, y1, l1, s1 in ps: - if x1 >= x2 and y1 <= y2 and l1 < l2 and (x1, y1) != (x2, y2): # 2 contains 1 - pass - elif x1 > y2: # 1 is to the right of 2 - if x1 != y2 + 1 and y2 != -1: - print() # to see the missing numbers - (x2, y2, l2, s2) = (x1, y1, l1, s1) - print(s1) - else: - raise Exception("invalid parts intersection: " + s1 + " and " + s2) - print() diff --git a/src/Storages/examples/async_read_buffer_from_hdfs.cpp b/src/Storages/examples/async_read_buffer_from_hdfs.cpp deleted file mode 100644 index 4f6aed8ef65..00000000000 --- a/src/Storages/examples/async_read_buffer_from_hdfs.cpp +++ /dev/null @@ -1,37 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -int main() -{ - using namespace DB; - namespace fs = std::filesystem; - - String config_path = "/path/to/config/file"; - ConfigProcessor config_processor(config_path, false, true); - config_processor.setConfigPath(fs::path(config_path).parent_path()); - auto loaded_config = config_processor.loadConfig(false); - auto * config = loaded_config.configuration.duplicate(); - - String hdfs_namenode_url = "hdfs://namenode:port/"; - String path = "/path/to/hdfs/file"; - ReadSettings settings = {}; - auto in = std::make_unique(hdfs_namenode_url, path, *config, settings); - auto & reader = getThreadPoolReader(FilesystemReaderType::ASYNCHRONOUS_REMOTE_FS_READER); - AsynchronousReadBufferFromHDFS buf(reader, {}, std::move(in)); - - String output; - WriteBufferFromString out(output); - copyData(buf, out); - std::cout << 
"output:" << output << std::endl; - return 0; -} diff --git a/src/Storages/examples/get_abandonable_lock_in_all_partitions.cpp b/src/Storages/examples/get_abandonable_lock_in_all_partitions.cpp deleted file mode 100644 index 4607d68f02d..00000000000 --- a/src/Storages/examples/get_abandonable_lock_in_all_partitions.cpp +++ /dev/null @@ -1,71 +0,0 @@ -#include -#include -#include -#include -#include - -#include - -#include - - -using namespace DB; - -/// This test is useful for assessing the performance of acquiring block numbers in all partitions (and there -/// can be ~1000 of them). This is needed when creating a mutation entry for a ReplicatedMergeTree table. -int main(int argc, char ** argv) -try -{ - if (argc != 3) - { - std::cerr << "usage: " << argv[0] << " " << std::endl; - return 3; - } - - ConfigProcessor processor(argv[1], false, true); - auto config = processor.loadConfig().configuration; - String root_path = argv[2]; - - auto zk = zkutil::ZooKeeper::createWithoutKillingPreviousSessions(*config, zkutil::getZooKeeperConfigName(*config), nullptr); - - String temp_path = root_path + "/temp"; - String blocks_path = root_path + "/block_numbers"; - - Stopwatch total_timer; - Stopwatch timer; - - EphemeralLocksInAllPartitions locks(blocks_path, "test_lock-", temp_path, *zk); - - std::cerr << "Locked, elapsed: " << timer.elapsedSeconds() << std::endl; - for (const auto & lock : locks.getLocks()) - std::cout << lock.partition_id << " " << lock.number << std::endl; - timer.restart(); - - locks.unlock(); - std::cerr << "Abandoned, elapsed: " << timer.elapsedSeconds() << std::endl; - - std::cerr << "Total elapsed: " << total_timer.elapsedSeconds() << std::endl; - - return 0; -} -catch (const Exception & e) -{ - std::cerr << e.what() << ", " << e.displayText() << ": " << std::endl - << e.getStackTraceString() << std::endl; - throw; -} -catch (Poco::Exception & e) -{ - std::cerr << "Exception: " << e.displayText() << std::endl; - throw; -} -catch (std::exception & e) -{ - std::cerr << "std::exception: " << e.what() << std::endl; - throw; -} -catch (...) -{ - std::cerr << "Some exception" << std::endl; - throw; -} From adf056b54ad35638245c1a5c30bc57c4cb17fc6c Mon Sep 17 00:00:00 2001 From: Dan Wu Date: Sun, 10 Mar 2024 20:42:41 +0800 Subject: [PATCH 0195/1081] Update check-large-objects.sh to be language neutral The previous implementation uses `total` keyword to filter out lines that contains the total size of objects under a directory. But when the OS uses other language, it fails to filter out that line, so the script would fail. This implementation changes the script to match only lines that contains object file by their extension. This implementation would be language neutral. --- utils/check-style/check-large-objects.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/check-style/check-large-objects.sh b/utils/check-style/check-large-objects.sh index 5c1276e5732..04f4e0c3171 100755 --- a/utils/check-style/check-large-objects.sh +++ b/utils/check-style/check-large-objects.sh @@ -12,7 +12,7 @@ TU_EXCLUDES=( Aggregator ) -if find $1 -name '*.o' | xargs wc -c | grep -v total | sort -rn | awk '{ if ($1 > 50000000) print }' \ +if find $1 -name '*.o' | xargs wc -c | grep --regexp='.o$' | sort -rn | awk '{ if ($1 > 50000000) print }' \ | grep -v -f <(printf "%s\n" "${TU_EXCLUDES[@]}") then echo "^ It's not allowed to have so large translation units." 
From 530efbe8102023f857ec29b1f9c7f2090973f736 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 10 Mar 2024 13:08:50 +0000 Subject: [PATCH 0196/1081] Beautify exception, pt. II --- src/Functions/array/arrayDistance.cpp | 6 +++--- src/Functions/array/arrayDotProduct.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index 8b591e37ff6..69e5e3712dd 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -357,7 +357,7 @@ public: throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Arguments of function {} has nested type {}. " - "Support: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", + "Supported types: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", getName(), common_type->getName()); } @@ -412,7 +412,7 @@ private: throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Arguments of function {} has nested type {}. " - "Support: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", + "Supported types: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", getName(), type_x->getName()); } @@ -437,7 +437,7 @@ private: throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Arguments of function {} has nested type {}. " - "Support: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", + "Supported types: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", getName(), type_y->getName()); } diff --git a/src/Functions/array/arrayDotProduct.cpp b/src/Functions/array/arrayDotProduct.cpp index 3f37e6f609f..a9547ca90bb 100644 --- a/src/Functions/array/arrayDotProduct.cpp +++ b/src/Functions/array/arrayDotProduct.cpp @@ -237,7 +237,7 @@ private: throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Arguments of function {} has nested type {}. 
" - "Support: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", + "Supported types: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64, Float32, Float64.", getName(), type_y->getName()); } From bcd705517a510feffd08e9e27c93c2e09eecd144 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 10 Mar 2024 12:53:58 +0000 Subject: [PATCH 0197/1081] Remove unnecessary call to convertToFullColumnIfConst() --- src/Functions/array/arrayDistance.cpp | 3 --- src/Functions/array/arrayDotProduct.cpp | 3 --- src/Functions/array/arrayNorm.cpp | 3 +-- 3 files changed, 1 insertion(+), 8 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index 69e5e3712dd..6ed4bf24f99 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -455,9 +455,6 @@ private: return executeWithLeftArgConst(col_y, col_x, input_rows_count, arguments); } - col_x = col_x->convertToFullColumnIfConst(); - col_y = col_y->convertToFullColumnIfConst(); - const auto & array_x = *assert_cast(col_x.get()); const auto & array_y = *assert_cast(col_y.get()); diff --git a/src/Functions/array/arrayDotProduct.cpp b/src/Functions/array/arrayDotProduct.cpp index a9547ca90bb..783843a89d5 100644 --- a/src/Functions/array/arrayDotProduct.cpp +++ b/src/Functions/array/arrayDotProduct.cpp @@ -255,9 +255,6 @@ private: return executeWithLeftArgConst(col_y, col_x, input_rows_count); } - col_x = col_x->convertToFullColumnIfConst(); - col_y = col_y->convertToFullColumnIfConst(); - const auto & array_x = *assert_cast(col_x.get()); const auto & array_y = *assert_cast(col_y.get()); diff --git a/src/Functions/array/arrayNorm.cpp b/src/Functions/array/arrayNorm.cpp index 027a33d094c..e87eff6add1 100644 --- a/src/Functions/array/arrayNorm.cpp +++ b/src/Functions/array/arrayNorm.cpp @@ -175,8 +175,7 @@ public: } } - ColumnPtr - executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t input_rows_count) const override { DataTypePtr type = typeid_cast(arguments[0].type.get())->getNestedType(); ColumnPtr column = arguments[0].column->convertToFullColumnIfConst(); From 349894fc2b73fad70910c325e0de053a56ec1bc7 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 8 Mar 2024 11:27:09 +0000 Subject: [PATCH 0198/1081] Fixes #61051 --- docs/en/operations/settings/settings.md | 12 +++++ .../functions/string-search-functions.md | 18 ++++++- src/Core/Settings.h | 1 + src/Core/SettingsChangesHistory.h | 1 + src/Functions/FunctionsStringSearch.h | 53 ++++++++++++++----- .../array/FunctionsMapMiscellaneous.cpp | 1 + src/Functions/like.cpp | 2 +- src/Functions/locate.cpp | 34 ++++++++++++ src/Functions/position.cpp | 1 - .../0_stateless/00765_locate.reference | 7 +++ tests/queries/0_stateless/00765_locate.sql | 15 ++++++ .../00765_sql_compatibility_aliases.reference | 1 - .../00765_sql_compatibility_aliases.sql | 1 - ..._case_insensitive_function_names.reference | 1 - ...malize_case_insensitive_function_names.sql | 2 +- 15 files changed, 130 insertions(+), 20 deletions(-) create mode 100644 src/Functions/locate.cpp create mode 100644 tests/queries/0_stateless/00765_locate.reference create mode 100644 tests/queries/0_stateless/00765_locate.sql diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 622644a1543..e75435669fb 100644 --- 
a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -4336,6 +4336,18 @@ Possible values:

 Default value: `0`.

+
+## function_locate_has_mysql_compatible_argument_order {#function-locate-has-mysql-compatible-argument-order}
+
+Controls the order of arguments in function [locate](../../sql-reference/functions/string-search-functions.md#locate).
+
+Possible values:
+
+- 0 — Function `locate` accepts arguments `(haystack, needle[, start_pos])`.
+- 1 — Function `locate` accepts arguments `(needle, haystack[, start_pos])` (MySQL-compatible behavior).
+
+Default value: `1`.
+
 ## date_time_overflow_behavior {#date_time_overflow_behavior}

 Defines the behavior when [Date](../../sql-reference/data-types/date.md), [Date32](../../sql-reference/data-types/date32.md), [DateTime](../../sql-reference/data-types/datetime.md), [DateTime64](../../sql-reference/data-types/datetime64.md) or integers are converted into Date, Date32, DateTime or DateTime64 but the value cannot be represented in the result type.
diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md
index 22f879c62ae..1b03f220db2 100644
--- a/docs/en/sql-reference/functions/string-search-functions.md
+++ b/docs/en/sql-reference/functions/string-search-functions.md
@@ -30,7 +30,6 @@ position(haystack, needle[, start_pos])

 Alias:
 - `position(needle IN haystack)`
-- `locate(haystack, needle[, start_pos])`.

 **Arguments**

@@ -49,7 +48,7 @@ If substring `needle` is empty, these rules apply:
 - if `start_pos >= 1` and `start_pos <= length(haystack) + 1`: return `start_pos`
 - otherwise: return `0`

-The same rules also apply to functions `positionCaseInsensitive`, `positionUTF8` and `positionCaseInsensitiveUTF8`
+The same rules also apply to functions `locate`, `positionCaseInsensitive`, `positionUTF8` and `positionCaseInsensitiveUTF8`.

 Type: `Integer`.

@@ -114,6 +113,21 @@ SELECT
 └─────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┴────────────────────────┘
 ```

+## locate
+
+Like [position](#position) but with arguments `haystack` and `needle` switched.
+
+The behavior of this function depends on the ClickHouse version:
+- in versions < v24.3, `locate` was an alias of function `position` and accepted arguments `(haystack, needle[, start_pos])`.
+- in versions >= v24.3, `locate` is an individual function (for better compatibility with MySQL) and accepts arguments `(needle, haystack[, start_pos])`. The previous behavior
+  can be restored using setting [function_locate_has_mysql_compatible_argument_order = false](../../operations/settings/settings.md#function-locate-has-mysql-compatible-argument-order).
+
+**Syntax**
+
+``` sql
+locate(needle, haystack[, start_pos])
+```
+
 ## positionCaseInsensitive

 Like [position](#position) but searches case-insensitively.
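The two argument orders are easy to see side by side; a hypothetical session against a server running this patch (the `clickhouse-client` invocations are illustrative, and the expected results match the stateless test added later in this patch):

```bash
# MySQL-compatible order is now the default: 'abcabc' is the needle and
# 'ca' the haystack, so nothing is found and the result is 0.
clickhouse-client --query "SELECT locate('abcabc', 'ca')"

# Disabling the setting restores the old (haystack, needle) order,
# and 'ca' is found in 'abcabc' at position 3.
clickhouse-client --query "
    SELECT locate('abcabc', 'ca')
    SETTINGS function_locate_has_mysql_compatible_argument_order = 0"
```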
diff --git a/src/Core/Settings.h b/src/Core/Settings.h index c8bdb515baf..185fa99e062 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -175,6 +175,7 @@ class IColumn; M(Bool, enable_positional_arguments, true, "Enable positional arguments in ORDER BY, GROUP BY and LIMIT BY", 0) \ M(Bool, enable_extended_results_for_datetime_functions, false, "Enable date functions like toLastDayOfMonth return Date32 results (instead of Date results) for Date32/DateTime64 arguments.", 0) \ M(Bool, allow_nonconst_timezone_arguments, false, "Allow non-const timezone arguments in certain time-related functions like toTimeZone(), fromUnixTimestamp*(), snowflakeToDateTime*()", 0) \ + M(Bool, function_locate_has_mysql_compatible_argument_order, true, "Function locate() has arguments (needle, haystack[, start_pos]) like in MySQL instead of (haystack, needle[, start_pos]) like function position()", 0) \ \ M(Bool, group_by_use_nulls, false, "Treat columns mentioned in ROLLUP, CUBE or GROUPING SETS as Nullable", 0) \ \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 2f1da7935e6..1c5e8ab3818 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -92,6 +92,7 @@ static std::map sett {"input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects", false, false, "Allow to use String type for ambiguous paths during named tuple inference from JSON objects"}, {"throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert", false, true, "Deduplication is dependent materialized view cannot work together with async inserts."}, {"parallel_replicas_allow_in_with_subquery", false, true, "If true, subquery for IN will be executed on every follower replica"}, + {"function_locate_has_mysql_compatible_argument_order", false, true, "Increase compatibility with MySQL's locate function."}, }}, {"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"}, {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"}, diff --git a/src/Functions/FunctionsStringSearch.h b/src/Functions/FunctionsStringSearch.h index 41b476ccc56..53d99198134 100644 --- a/src/Functions/FunctionsStringSearch.h +++ b/src/Functions/FunctionsStringSearch.h @@ -22,13 +22,13 @@ namespace DB * positionCaseInsensitive(haystack, needle) * positionCaseInsensitiveUTF8(haystack, needle) * - * like(haystack, pattern) - search by the regular expression LIKE; Returns 0 or 1. Case-insensitive, but only for Latin. - * notLike(haystack, pattern) + * like(haystack, needle) - search by the regular expression LIKE; Returns 0 or 1. Case-insensitive, but only for Latin. + * notLike(haystack, needle) * - * ilike(haystack, pattern) - like 'like' but case-insensitive - * notIlike(haystack, pattern) + * ilike(haystack, needle) - like 'like' but case-insensitive + * notIlike(haystack, needle) * - * match(haystack, pattern) - search by regular expression re2; Returns 0 or 1. + * match(haystack, needle) - search by regular expression re2; Returns 0 or 1. * * countSubstrings(haystack, needle) -- count number of occurrences of needle in haystack. * countSubstringsCaseInsensitive(haystack, needle) @@ -53,7 +53,7 @@ namespace DB * - the first subpattern, if the regexp has a subpattern; * - the zero subpattern (the match part, otherwise); * - if not match - an empty string. 
- * extract(haystack, pattern) + * extract(haystack, needle) */ namespace ErrorCodes @@ -69,13 +69,39 @@ enum class ExecutionErrorPolicy Throw }; -template +enum class HaystackNeedleOrderIsConfigurable +{ + No, /// function arguments are always: (haystack, needle[, position]) + Yes /// depending on a setting, the function arguments are (haystack, needle[, position]) or (needle, haystack[, position]) +}; + +template class FunctionsStringSearch : public IFunction { +private: + enum class ArgumentOrder + { + HaystackNeedle, + NeedleHaystack + }; + + ArgumentOrder argument_order = ArgumentOrder::HaystackNeedle; + public: static constexpr auto name = Impl::name; - static FunctionPtr create(ContextPtr) { return std::make_shared(); } + static FunctionPtr create(ContextPtr context) { return std::make_shared(context); } + + explicit FunctionsStringSearch([[maybe_unused]] ContextPtr context) + { + if constexpr (haystack_needle_order_is_configurable == HaystackNeedleOrderIsConfigurable::Yes) + { + if (context->getSettingsRef().function_locate_has_mysql_compatible_argument_order) + argument_order = ArgumentOrder::NeedleHaystack; + } + } String getName() const override { return name; } @@ -105,13 +131,16 @@ public: "Number of arguments for function {} doesn't match: passed {}, should be 2 or 3", getName(), arguments.size()); - if (!isStringOrFixedString(arguments[0])) + const auto & haystack_type = (argument_order == ArgumentOrder::HaystackNeedle) ? arguments[0] : arguments[1]; + const auto & needle_type = (argument_order == ArgumentOrder::HaystackNeedle) ? arguments[1] : arguments[0]; + + if (!isStringOrFixedString(haystack_type)) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName()); - if (!isString(arguments[1])) + if (!isString(needle_type)) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", @@ -135,8 +164,8 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override { - const ColumnPtr & column_haystack = arguments[0].column; - const ColumnPtr & column_needle = arguments[1].column; + const ColumnPtr & column_haystack = (argument_order == ArgumentOrder::HaystackNeedle) ? arguments[0].column : arguments[1].column; + const ColumnPtr & column_needle = (argument_order == ArgumentOrder::HaystackNeedle) ? arguments[1].column : arguments[0].column; ColumnPtr column_start_pos = nullptr; if (arguments.size() >= 3) diff --git a/src/Functions/array/FunctionsMapMiscellaneous.cpp b/src/Functions/array/FunctionsMapMiscellaneous.cpp index 157f2fa8a26..d92bfcf0bc6 100644 --- a/src/Functions/array/FunctionsMapMiscellaneous.cpp +++ b/src/Functions/array/FunctionsMapMiscellaneous.cpp @@ -213,6 +213,7 @@ struct MapToSubcolumnAdapter : public MapAdapterBase, ExecutionErrorPolicy::Throw, HaystackNeedleOrderIsConfigurable::Yes>; + +} + +REGISTER_FUNCTION(Locate) +{ + FunctionDocumentation::Description doc_description = "Like function `position` but with arguments `haystack` and `locate` switched. The behavior of this function depends on the ClickHouse version: In versions < v24.3, `locate` was an alias of function `position` and accepted arguments `(haystack, needle[, start_pos])`. In versions >= 24.3,, `locate` is an individual function (for better compatibility with MySQL) and accepts arguments `(needle, haystack[, start_pos])`. 
The previous behavior can be restored using setting `function_locate_has_mysql_compatible_argument_order = false`.";
+    FunctionDocumentation::Syntax doc_syntax = "locate(needle, haystack[, start_pos])";
+    FunctionDocumentation::Arguments doc_arguments = {{"needle", "Substring to be searched (String)"},
+                                                      {"haystack", "String in which the search is performed (String)."},
+                                                      {"start_pos", "Position (1-based) in `haystack` at which the search starts (UInt*)."}};
+    FunctionDocumentation::ReturnedValue doc_returned_value = "Starting position in bytes and counting from 1, if the substring was found. 0, if the substring was not found.";
+    FunctionDocumentation::Examples doc_examples = {{"Example", "SELECT locate('ca', 'abcabc');", "3"}};
+    FunctionDocumentation::Categories doc_categories = {"String search"};
+
+
+    factory.registerFunction({doc_description, doc_syntax, doc_arguments, doc_returned_value, doc_examples, doc_categories}, FunctionFactory::CaseInsensitive);
+}
+}
diff --git a/src/Functions/position.cpp b/src/Functions/position.cpp
index 409a593b44c..29a5db2eb24 100644
--- a/src/Functions/position.cpp
+++ b/src/Functions/position.cpp
@@ -20,6 +20,5 @@ using FunctionPosition = FunctionsStringSearch;

 REGISTER_FUNCTION(Position)
 {
     factory.registerFunction({}, FunctionFactory::CaseInsensitive);
-    factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
 }
 }
diff --git a/tests/queries/0_stateless/00765_locate.reference b/tests/queries/0_stateless/00765_locate.reference
new file mode 100644
index 00000000000..3b066c0d68b
--- /dev/null
+++ b/tests/queries/0_stateless/00765_locate.reference
@@ -0,0 +1,7 @@
+-- negative tests
+-- test mysql compatibility setting
+0
+0
+3
+-- the function name needs to be case-insensitive for historical reasons
+0
diff --git a/tests/queries/0_stateless/00765_locate.sql b/tests/queries/0_stateless/00765_locate.sql
new file mode 100644
index 00000000000..3467ebd4249
--- /dev/null
+++ b/tests/queries/0_stateless/00765_locate.sql
@@ -0,0 +1,15 @@
+SET send_logs_level = 'fatal';
+
+SELECT '-- negative tests';
+SELECT locate(); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH }
+SELECT locate(1, 'abc'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT locate('abc', 1); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+SELECT locate('abc', 'abc', 'abc'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT }
+
+SELECT '-- test mysql compatibility setting';
+SELECT locate('abcabc', 'ca');
+SELECT locate('abcabc', 'ca') SETTINGS function_locate_has_mysql_compatible_argument_order = true;
+SELECT locate('abcabc', 'ca') SETTINGS function_locate_has_mysql_compatible_argument_order = false;
+
+SELECT '-- the function name needs to be case-insensitive for historical reasons';
+SELECT LoCaTe('abcabc', 'ca');
diff --git a/tests/queries/0_stateless/00765_sql_compatibility_aliases.reference b/tests/queries/0_stateless/00765_sql_compatibility_aliases.reference
index 285b9a62d20..6d31168c2b7 100644
--- a/tests/queries/0_stateless/00765_sql_compatibility_aliases.reference
+++ b/tests/queries/0_stateless/00765_sql_compatibility_aliases.reference
@@ -4,7 +4,6 @@ foo
 FOO
 baz
 zzz
-2
 fo
 oo
 o
diff --git a/tests/queries/0_stateless/00765_sql_compatibility_aliases.sql b/tests/queries/0_stateless/00765_sql_compatibility_aliases.sql
index da0eb9bea6d..995aaef9ea5 100644
--- a/tests/queries/0_stateless/00765_sql_compatibility_aliases.sql
+++ b/tests/queries/0_stateless/00765_sql_compatibility_aliases.sql
@@ -6,7 +6,6 @@ select LOWER('Foo');
 select UPPER('Foo');
 select REPLACE('bar', 'r', 'z');
 select REGEXP_REPLACE('bar', '.', 'z');
-select Locate('foo', 'o'); select SUBSTRING('foo', 1, 2); select Substr('foo', 2); select mid('foo', 3); diff --git a/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference index de5a62159ef..76dd04b0e42 100644 --- a/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference +++ b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.reference @@ -26,7 +26,6 @@ SELECT least(1), length('1'), log(1), - position('1', '1'), log(1), log10(1), log2(1), diff --git a/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql index dda2e045e76..d8054b9757c 100644 --- a/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql +++ b/tests/queries/0_stateless/01705_normalize_case_insensitive_function_names.sql @@ -1 +1 @@ -EXPLAIN SYNTAX SELECT CAST(1 AS INT), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH('1'), CHARACTER_LENGTH('1'), COALESCE(1), CONCAT('1', '1'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), SCHEMA(), DATEDIFF('DAY', toDate('2020-10-24'), toDate('2019-10-24')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE('A'), LEAST(1), LENGTH('1'), LN(1), LOCATE('1', '1'), LOG(1), LOG10(1), LOG2(1), LOWER('A'), MAX(1), MID('123', 1, 1), MIN(1), MOD(1, 1), NOT(1), NOW(), NOW64(), NULLIF(1, 1), PI(), POSITION('123', '2'), POW(1, 1), POWER(1, 1), RAND(), REPLACE('1', '1', '2'), REVERSE('123'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), SUBSTR('123', 2), SUBSTRING('123', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE('A'), UPPER('A'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate('2020-10-24')), YEARWEEK(toDate('2020-10-24')) format TSVRaw; +EXPLAIN SYNTAX SELECT CAST(1 AS INT), CEIL(1), CEILING(1), CHAR(49), CHAR_LENGTH('1'), CHARACTER_LENGTH('1'), COALESCE(1), CONCAT('1', '1'), CORR(1, 1), COS(1), COUNT(1), COVAR_POP(1, 1), COVAR_SAMP(1, 1), DATABASE(), SCHEMA(), DATEDIFF('DAY', toDate('2020-10-24'), toDate('2019-10-24')), EXP(1), FLATTEN([[1]]), FLOOR(1), FQDN(), GREATEST(1), IF(1, 1, 1), IFNULL(1, 1), LCASE('A'), LEAST(1), LENGTH('1'), LN(1), LOG(1), LOG10(1), LOG2(1), LOWER('A'), MAX(1), MID('123', 1, 1), MIN(1), MOD(1, 1), NOT(1), NOW(), NOW64(), NULLIF(1, 1), PI(), POSITION('123', '2'), POW(1, 1), POWER(1, 1), RAND(), REPLACE('1', '1', '2'), REVERSE('123'), ROUND(1), SIN(1), SQRT(1), STDDEV_POP(1), STDDEV_SAMP(1), SUBSTR('123', 2), SUBSTRING('123', 2), SUM(1), TAN(1), TANH(1), TRUNC(1), TRUNCATE(1), UCASE('A'), UPPER('A'), USER(), VAR_POP(1), VAR_SAMP(1), WEEK(toDate('2020-10-24')), YEARWEEK(toDate('2020-10-24')) format TSVRaw; From 0b63cb237a5a72c96bbc3d4cf52ab70a5d2ad2aa Mon Sep 17 00:00:00 2001 From: kssenii Date: Sun, 10 Mar 2024 12:05:32 +0100 Subject: [PATCH 0199/1081] Fix --- src/Storages/System/StorageSystemDisks.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/System/StorageSystemDisks.cpp b/src/Storages/System/StorageSystemDisks.cpp index f67d4f7acd0..eecc889f86b 100644 --- a/src/Storages/System/StorageSystemDisks.cpp +++ b/src/Storages/System/StorageSystemDisks.cpp @@ -73,9 +73,9 @@ Pipe StorageSystemDisks::read( col_unreserved->insert(disk_ptr->getUnreservedSpace().value_or(std::numeric_limits::max())); col_keep->insert(disk_ptr->getKeepingFreeSpace()); auto 
data_source_description = disk_ptr->getDataSourceDescription(); - col_type->insert(data_source_description.type); - col_object_storage_type->insert(data_source_description.object_storage_type); - col_metadata_type->insert(data_source_description.metadata_type); + col_type->insert(magic_enum::enum_name(data_source_description.type)); + col_object_storage_type->insert(magic_enum::enum_name(data_source_description.object_storage_type)); + col_metadata_type->insert(magic_enum::enum_name(data_source_description.metadata_type)); col_is_encrypted->insert(data_source_description.is_encrypted); col_is_read_only->insert(disk_ptr->isReadOnly()); col_is_write_once->insert(disk_ptr->isWriteOnce()); From c1caeaa81483e33c666679fb1ce20d6a7da154c3 Mon Sep 17 00:00:00 2001 From: Han Fei Date: Sun, 10 Mar 2024 20:51:54 +0100 Subject: [PATCH 0200/1081] fix build --- src/Common/CgroupsMemoryUsageObserver.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/CgroupsMemoryUsageObserver.h b/src/Common/CgroupsMemoryUsageObserver.h index 639433b5016..fc0d847af4f 100644 --- a/src/Common/CgroupsMemoryUsageObserver.h +++ b/src/Common/CgroupsMemoryUsageObserver.h @@ -91,7 +91,7 @@ public: void setLimits(uint64_t, uint64_t) {} size_t readMemoryUsage() { return 0; } - void startThread(); + void startThread() {} size_t getHardLimit() { return 0; } size_t getSoftLimit() { return 0; } void setOnMemoryLimitUpdate(std::function) {} From 7a8d79de234e4f83a4e6421e5de49e5800ff70bc Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Sun, 10 Mar 2024 22:13:49 +0100 Subject: [PATCH 0201/1081] reload CI due to OOM in build From d2b7fb03b12c5ce3ec5a377f3872483639e8c4fe Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 10 Mar 2024 23:36:03 +0100 Subject: [PATCH 0202/1081] Fix localization in check-large-objects --- utils/check-style/check-large-objects.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/check-style/check-large-objects.sh b/utils/check-style/check-large-objects.sh index 4eb9190512f..3e2a385bdd0 100755 --- a/utils/check-style/check-large-objects.sh +++ b/utils/check-style/check-large-objects.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +export LC_ALL=C # The "total" should be printed without localization + # Check that there are no new translation units compiled to an object file larger than a certain size. TU_EXCLUDES=( From fc9efeddf77dc76981ccbad263566911994d33a2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 11 Mar 2024 01:42:44 +0300 Subject: [PATCH 0203/1081] Update check-large-objects.sh --- utils/check-style/check-large-objects.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/check-style/check-large-objects.sh b/utils/check-style/check-large-objects.sh index 04f4e0c3171..5faacae84b9 100755 --- a/utils/check-style/check-large-objects.sh +++ b/utils/check-style/check-large-objects.sh @@ -12,7 +12,7 @@ TU_EXCLUDES=( Aggregator ) -if find $1 -name '*.o' | xargs wc -c | grep --regexp='.o$' | sort -rn | awk '{ if ($1 > 50000000) print }' \ +if find $1 -name '*.o' | xargs wc -c | grep --regexp='\.o$' | sort -rn | awk '{ if ($1 > 50000000) print }' \ | grep -v -f <(printf "%s\n" "${TU_EXCLUDES[@]}") then echo "^ It's not allowed to have so large translation units." 
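The one-character regex fix above matters because an unescaped dot matches any character, so unrelated lines ending in `o` would pass the filter. A minimal shell check with made-up input lines:

```bash
# The unescaped pattern also keeps the bogus 'foo' line,
# because '.' matches the first 'o'.
printf '%s\n' '123 Aggregator.o' '456 foo' '789 total' | grep '.o$'
# 123 Aggregator.o
# 456 foo

# Escaping the dot restricts the match to real '.o' object files.
printf '%s\n' '123 Aggregator.o' '456 foo' '789 total' | grep '\.o$'
# 123 Aggregator.o
```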
From b354d07b829a94a43cf6f3867585efac389088c1 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Mon, 11 Mar 2024 14:59:22 +0800 Subject: [PATCH 0204/1081] remove break --- src/Processors/Transforms/FilterTransform.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Processors/Transforms/FilterTransform.cpp b/src/Processors/Transforms/FilterTransform.cpp index 0f2509c7510..b3be9246f43 100644 --- a/src/Processors/Transforms/FilterTransform.cpp +++ b/src/Processors/Transforms/FilterTransform.cpp @@ -337,7 +337,6 @@ void FilterTransform::doTransform(Chunk & chunk) min_size_in_memory = size_in_memory; first_non_constant_column = i; } - break; } } (void)min_size_in_memory; /// Suppress error of clang-analyzer-deadcode.DeadStores From 1c82f8707d76821cbe006ed3f9be84822677cd9b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 11 Mar 2024 08:38:03 +0100 Subject: [PATCH 0205/1081] Fix tidy --- src/Functions/FunctionsConversion.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Functions/FunctionsConversion.cpp b/src/Functions/FunctionsConversion.cpp index d2d72558500..94fd960a99e 100644 --- a/src/Functions/FunctionsConversion.cpp +++ b/src/Functions/FunctionsConversion.cpp @@ -1056,7 +1056,7 @@ struct ConvertImpl, DataTypeNumber, Name, Con } }; -static inline ColumnUInt8::MutablePtr copyNullMap(ColumnPtr col) +inline ColumnUInt8::MutablePtr copyNullMap(ColumnPtr col) { ColumnUInt8::MutablePtr null_map = nullptr; if (const auto * col_null = checkAndGetColumn(col.get())) @@ -1984,7 +1984,7 @@ struct NameParseDateTimeBestEffortOrZero; struct NameParseDateTimeBestEffortOrNull; template -static inline bool isDateTime64(const ColumnsWithTypeAndName & arguments) +inline bool isDateTime64(const ColumnsWithTypeAndName & arguments) { if constexpr (std::is_same_v) return true; @@ -3391,7 +3391,7 @@ arguments, result_type, input_rows_count); \ case IntervalKind::Kind::INTERVAL_KIND: \ return createFunctionAdaptor(FunctionConvert::create(), from_type); - static WrapperType createIntervalWrapper(const DataTypePtr & from_type, IntervalKind kind) + static WrapperType createIntervalWrapper(const DataTypePtr & from_type, IntervalKind::Kind kind) { switch (kind) { @@ -3994,7 +3994,7 @@ arguments, result_type, input_rows_count); \ { return [is_nullable = to_type->hasNullableSubcolumns()] (ColumnsWithTypeAndName & arguments, const DataTypePtr & , const ColumnNullable * , size_t) -> ColumnPtr { - auto & column_object = assert_cast(*arguments.front().column); + const auto & column_object = assert_cast(*arguments.front().column); auto res = ColumnObject::create(is_nullable); for (size_t i = 0; i < column_object.size(); i++) res->insert(column_object[i]); From 61c3d917ae82bb6ae14cea3a4a7c36c19f6d3b22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?= Date: Mon, 11 Mar 2024 10:33:09 +0000 Subject: [PATCH 0206/1081] Use `boost::algorithm::join` --- src/Common/FunctionDocumentation.cpp | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/src/Common/FunctionDocumentation.cpp b/src/Common/FunctionDocumentation.cpp index 0dc5b48f9d1..7b554539a4f 100644 --- a/src/Common/FunctionDocumentation.cpp +++ b/src/Common/FunctionDocumentation.cpp @@ -1,5 +1,7 @@ #include +#include + namespace DB { @@ -31,15 +33,7 @@ std::string FunctionDocumentation::examplesAsString() const std::string FunctionDocumentation::categoriesAsString() const { - if (categories.empty()) - return ""; - - auto it = categories.begin(); - std::string res = 
*it; - ++it; - for (; it != categories.end(); ++it) - res += ", " + *it; - return res; + return boost::algorithm::join(categories, ", "); } } From 8a11afeba1e80f3f0210af5d19338550f631ceab Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 11 Mar 2024 11:44:34 +0100 Subject: [PATCH 0207/1081] Updated settings changed history --- src/Core/SettingsChangesHistory.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index b68789d5f43..d458f935edf 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -93,6 +93,12 @@ static std::map sett {"input_format_json_use_string_type_for_ambiguous_paths_in_named_tuples_inference_from_objects", false, false, "Allow to use String type for ambiguous paths during named tuple inference from JSON objects"}, {"throw_if_deduplication_in_dependent_materialized_views_enabled_with_async_insert", false, true, "Deduplication is dependent materialized view cannot work together with async inserts."}, {"parallel_replicas_allow_in_with_subquery", false, true, "If true, subquery for IN will be executed on every follower replica"}, + {"azure_max_inflight_parts_for_one_file", 20, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited."}, + {"azure_strict_upload_part_size", 0, 0, "The exact size of part to upload during multipart upload to Azure blob storage."}, + {"azure_min_upload_part_size", 16*1024*1024, 16*1024*1024, "The minimum size of part to upload during multipart upload to Azure blob storage."}, + {"azure_max_upload_part_size", 5ull*1024*1024*1024, 5ull*1024*1024*1024, "The maximum size of part to upload during multipart upload to Azure blob storage."}, + {"azure_upload_part_size_multiply_factor", 2, 2, "Multiply azure_min_upload_part_size by this factor each time azure_multiply_parts_count_threshold parts were uploaded from a single write to Azure blob storage."}, + {"azure_upload_part_size_multiply_parts_count_threshold", 500, 500, "Each time this number of parts was uploaded to Azure blob storage, azure_min_upload_part_size is multiplied by azure_upload_part_size_multiply_factor."}, }}, {"24.2", {{"allow_suspicious_variant_types", true, false, "Don't allow creating Variant type with suspicious variants by default"}, {"validate_experimental_and_suspicious_types_inside_nested_types", false, true, "Validate usage of experimental and suspicious types inside nested types"}, @@ -122,7 +128,6 @@ static std::map sett {"default_normal_view_sql_security", "INVOKER", "INVOKER", "Allows to set default `SQL SECURITY` option while creating a normal view"}, {"mysql_map_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, {"mysql_map_fixed_string_to_text_in_show_columns", false, true, "Reduce the configuration effort to connect ClickHouse with BI tools."}, - {"azure_max_inflight_parts_for_one_file", 20, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 
0 means unlimited."}, }}, {"24.1", {{"print_pretty_type_names", false, true, "Better user experience."}, {"input_format_json_read_bools_as_strings", false, true, "Allow to read bools as strings in JSON formats by default"}, From ecb11005e34948a365555da1e2271e9da31d5074 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 11 Mar 2024 10:56:26 +0000 Subject: [PATCH 0208/1081] List clang-tidy checks as list instead of a string The string representation will be deprecated at some point: https://reviews.llvm.org/D147876 --- .clang-tidy | 195 ++++++++++++++++++++++++++-------------------------- 1 file changed, 97 insertions(+), 98 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index 0dacf813c7e..4aeb38ca409 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -5,128 +5,127 @@ # a) the new check is not controversial (this includes many checks in readability-* and google-*) or # b) too noisy (checks with > 100 new warnings are considered noisy, this includes e.g. cppcoreguidelines-*). -# TODO: Once clang(-tidy) 17 is the minimum, we can convert this list to YAML -# See https://releases.llvm.org/17.0.1/tools/clang/tools/extra/docs/ReleaseNotes.html#improvements-to-clang-tidy - # TODO Let clang-tidy check headers in further directories # --> HeaderFilterRegex: '^.*/(src|base|programs|utils)/.*(h|hpp)$' HeaderFilterRegex: '^.*/(base|programs|utils)/.*(h|hpp)$' -Checks: '*, - -abseil-*, +Checks: [ + '*', - -altera-*, + '-abseil-*', - -android-*, + '-altera-*', - -bugprone-assignment-in-if-condition, - -bugprone-branch-clone, - -bugprone-easily-swappable-parameters, - -bugprone-exception-escape, - -bugprone-implicit-widening-of-multiplication-result, - -bugprone-narrowing-conversions, - -bugprone-not-null-terminated-result, - -bugprone-reserved-identifier, # useful but too slow, TODO retry when https://reviews.llvm.org/rG1c282052624f9d0bd273bde0b47b30c96699c6c7 is merged - -bugprone-unchecked-optional-access, + '-android-*', - -cert-dcl16-c, - -cert-dcl37-c, - -cert-dcl51-cpp, - -cert-err58-cpp, - -cert-msc32-c, - -cert-msc51-cpp, - -cert-oop54-cpp, - -cert-oop57-cpp, + '-bugprone-assignment-in-if-condition', + '-bugprone-branch-clone', + '-bugprone-easily-swappable-parameters', + '-bugprone-exception-escape', + '-bugprone-implicit-widening-of-multiplication-result', + '-bugprone-narrowing-conversions', + '-bugprone-not-null-terminated-result', + '-bugprone-reserved-identifier', # useful but too slow, TODO retry when https://reviews.llvm.org/rG1c282052624f9d0bd273bde0b47b30c96699c6c7 is merged + '-bugprone-unchecked-optional-access', - -clang-analyzer-unix.Malloc, + '-cert-dcl16-c', + '-cert-dcl37-c', + '-cert-dcl51-cpp', + '-cert-err58-cpp', + '-cert-msc32-c', + '-cert-msc51-cpp', + '-cert-oop54-cpp', + '-cert-oop57-cpp', - -cppcoreguidelines-*, # impractical in a codebase as large as ClickHouse, also slow + '-clang-analyzer-unix.Malloc', - -darwin-*, + '-cppcoreguidelines-*', # impractical in a codebase as large as ClickHouse, also slow - -fuchsia-*, + '-darwin-*', - -google-build-using-namespace, - -google-readability-braces-around-statements, - -google-readability-casting, - -google-readability-function-size, - -google-readability-namespace-comments, - -google-readability-todo, + '-fuchsia-*', - -hicpp-avoid-c-arrays, - -hicpp-avoid-goto, - -hicpp-braces-around-statements, - -hicpp-explicit-conversions, - -hicpp-function-size, - -hicpp-member-init, - -hicpp-move-const-arg, - -hicpp-multiway-paths-covered, - -hicpp-named-parameter, - -hicpp-no-array-decay, - -hicpp-no-assembler, - 
-hicpp-no-malloc, - -hicpp-signed-bitwise, - -hicpp-special-member-functions, - -hicpp-uppercase-literal-suffix, - -hicpp-use-auto, - -hicpp-use-emplace, - -hicpp-vararg, + '-google-build-using-namespace', + '-google-readability-braces-around-statements', + '-google-readability-casting', + '-google-readability-function-size', + '-google-readability-namespace-comments', + '-google-readability-todo', - -linuxkernel-*, + '-hicpp-avoid-c-arrays', + '-hicpp-avoid-goto', + '-hicpp-braces-around-statements', + '-hicpp-explicit-conversions', + '-hicpp-function-size', + '-hicpp-member-init', + '-hicpp-move-const-arg', + '-hicpp-multiway-paths-covered', + '-hicpp-named-parameter', + '-hicpp-no-array-decay', + '-hicpp-no-assembler', + '-hicpp-no-malloc', + '-hicpp-signed-bitwise', + '-hicpp-special-member-functions', + '-hicpp-uppercase-literal-suffix', + '-hicpp-use-auto', + '-hicpp-use-emplace', + '-hicpp-vararg', - -llvm-*, + '-linuxkernel-*', - -llvmlibc-*, + '-llvm-*', - -openmp-*, + '-llvmlibc-*', - -misc-const-correctness, - -misc-include-cleaner, # useful but far too many occurrences - -misc-no-recursion, - -misc-non-private-member-variables-in-classes, - -misc-confusable-identifiers, # useful but slooow - -misc-use-anonymous-namespace, + '-openmp-*', - -modernize-avoid-c-arrays, - -modernize-concat-nested-namespaces, - -modernize-macro-to-enum, - -modernize-pass-by-value, - -modernize-return-braced-init-list, - -modernize-use-auto, - -modernize-use-default-member-init, - -modernize-use-emplace, - -modernize-use-nodiscard, - -modernize-use-override, - -modernize-use-trailing-return-type, + '-misc-const-correctness', + '-misc-include-cleaner', # useful but far too many occurrences + '-misc-no-recursion', + '-misc-non-private-member-variables-in-classes', + '-misc-confusable-identifiers', # useful but slooo + '-misc-use-anonymous-namespace', - -performance-inefficient-string-concatenation, - -performance-no-int-to-ptr, - -performance-avoid-endl, - -performance-unnecessary-value-param, + '-modernize-avoid-c-arrays', + '-modernize-concat-nested-namespaces', + '-modernize-macro-to-enum', + '-modernize-pass-by-value', + '-modernize-return-braced-init-list', + '-modernize-use-auto', + '-modernize-use-default-member-init', + '-modernize-use-emplace', + '-modernize-use-nodiscard', + '-modernize-use-override', + '-modernize-use-trailing-return-type', - -portability-simd-intrinsics, + '-performance-inefficient-string-concatenation', + '-performance-no-int-to-ptr', + '-performance-avoid-endl', + '-performance-unnecessary-value-param', - -readability-avoid-unconditional-preprocessor-if, - -readability-braces-around-statements, - -readability-convert-member-functions-to-static, - -readability-else-after-return, - -readability-function-cognitive-complexity, - -readability-function-size, - -readability-identifier-length, - -readability-identifier-naming, # useful but too slow - -readability-implicit-bool-conversion, - -readability-isolate-declaration, - -readability-magic-numbers, - -readability-named-parameter, - -readability-redundant-declaration, - -readability-simplify-boolean-expr, - -readability-static-accessed-through-instance, - -readability-suspicious-call-argument, - -readability-uppercase-literal-suffix, - -readability-use-anyofallof, + '-portability-simd-intrinsics', - -zircon-*, -' + '-readability-avoid-unconditional-preprocessor-if', + '-readability-braces-around-statements', + '-readability-convert-member-functions-to-static', + '-readability-else-after-return', + 
'-readability-function-cognitive-complexity', + '-readability-function-size', + '-readability-identifier-length', + '-readability-identifier-naming', # useful but too slow + '-readability-implicit-bool-conversion', + '-readability-isolate-declaration', + '-readability-magic-numbers', + '-readability-named-parameter', + '-readability-redundant-declaration', + '-readability-simplify-boolean-expr', + '-readability-static-accessed-through-instance', + '-readability-suspicious-call-argument', + '-readability-uppercase-literal-suffix', + '-readability-use-anyofallof', + + '-zircon-*' +] WarningsAsErrors: '*' From e5e84419aff0f559bc545737bfdc0518a732f7ff Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sun, 10 Mar 2024 14:29:18 +0000 Subject: [PATCH 0209/1081] Fix clang-tidy-s --- contrib/libmetrohash/src/metrohash128.h | 3 +++ src/Access/AccessControl.h | 10 +++++----- src/Access/IAccessStorage.cpp | 2 +- src/Access/IAccessStorage.h | 2 +- src/Common/Arena.h | 4 +--- src/Common/DNSResolver.cpp | 2 +- src/Common/DNSResolver.h | 2 +- src/Common/DateLUTImpl.h | 2 +- src/Common/MultiVersion.h | 4 ++-- src/Common/PODArray.h | 6 +++--- src/Common/SipHash.h | 2 +- src/Common/TransactionID.h | 2 +- src/Common/ZooKeeper/IKeeper.cpp | 8 ++++---- src/Common/ZooKeeper/IKeeper.h | 16 ++++++++-------- src/Common/logger_useful.h | 16 ++++++++-------- src/Core/PostgreSQL/insertPostgreSQLValue.cpp | 4 ++-- src/Core/PostgreSQL/insertPostgreSQLValue.h | 4 ++-- src/Core/Settings.h | 2 ++ src/Dictionaries/CacheDictionary.cpp | 4 ++-- src/Dictionaries/CacheDictionary.h | 2 +- .../GeodataProviders/IHierarchiesProvider.h | 2 +- src/Dictionaries/RegExpTreeDictionary.cpp | 2 +- src/Dictionaries/RegExpTreeDictionary.h | 2 +- src/Functions/IFunction.h | 4 ---- src/IO/ReadSettings.h | 1 + src/Interpreters/AsynchronousInsertQueue.cpp | 2 +- src/Interpreters/AsynchronousInsertQueue.h | 2 +- src/Interpreters/Context.h | 4 ++-- src/Interpreters/IExternalLoadable.h | 2 +- src/Interpreters/ProcessList.h | 2 +- src/Processors/Chunk.h | 2 +- .../Algorithms/AggregatingSortedAlgorithm.cpp | 2 +- src/Processors/Port.h | 2 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp | 4 ++-- src/Processors/TTL/TTLUpdateInfoAlgorithm.h | 4 ++-- src/Storages/StorageInMemoryMetadata.h | 4 ++-- 36 files changed, 69 insertions(+), 69 deletions(-) diff --git a/contrib/libmetrohash/src/metrohash128.h b/contrib/libmetrohash/src/metrohash128.h index 2dbb6ca5a8a..f507c917caf 100644 --- a/contrib/libmetrohash/src/metrohash128.h +++ b/contrib/libmetrohash/src/metrohash128.h @@ -17,6 +17,8 @@ #ifndef METROHASH_METROHASH_128_H #define METROHASH_METROHASH_128_H +// NOLINTBEGIN(readability-avoid-const-params-in-decls) + #include class MetroHash128 @@ -68,5 +70,6 @@ private: void metrohash128_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); void metrohash128_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out); +// NOLINTEND(readability-avoid-const-params-in-decls) #endif // #ifndef METROHASH_METROHASH_128_H diff --git a/src/Access/AccessControl.h b/src/Access/AccessControl.h index 55ea4e4f717..1af74e02fb7 100644 --- a/src/Access/AccessControl.h +++ b/src/Access/AccessControl.h @@ -133,20 +133,20 @@ public: /// This function also enables custom prefixes to be used. 
void setCustomSettingsPrefixes(const Strings & prefixes); void setCustomSettingsPrefixes(const String & comma_separated_prefixes); - bool isSettingNameAllowed(const std::string_view name) const; - void checkSettingNameIsAllowed(const std::string_view name) const; + bool isSettingNameAllowed(std::string_view name) const; + void checkSettingNameIsAllowed(std::string_view name) const; /// Allows implicit user creation without password (by default it's allowed). /// In other words, allow 'CREATE USER' queries without 'IDENTIFIED WITH' clause. - void setImplicitNoPasswordAllowed(const bool allow_implicit_no_password_); + void setImplicitNoPasswordAllowed(bool allow_implicit_no_password_); bool isImplicitNoPasswordAllowed() const; /// Allows users without password (by default it's allowed). - void setNoPasswordAllowed(const bool allow_no_password_); + void setNoPasswordAllowed(bool allow_no_password_); bool isNoPasswordAllowed() const; /// Allows users with plaintext password (by default it's allowed). - void setPlaintextPasswordAllowed(const bool allow_plaintext_password_); + void setPlaintextPasswordAllowed(bool allow_plaintext_password_); bool isPlaintextPasswordAllowed() const; /// Default password type when the user does not specify it. diff --git a/src/Access/IAccessStorage.cpp b/src/Access/IAccessStorage.cpp index fbe9e231002..1d6b8d99cd5 100644 --- a/src/Access/IAccessStorage.cpp +++ b/src/Access/IAccessStorage.cpp @@ -616,7 +616,7 @@ UUID IAccessStorage::generateRandomID() } -void IAccessStorage::clearConflictsInEntitiesList(std::vector> & entities, const LoggerPtr log_) +void IAccessStorage::clearConflictsInEntitiesList(std::vector> & entities, LoggerPtr log_) { std::unordered_map positions_by_id; std::unordered_map positions_by_type_and_name[static_cast(AccessEntityType::MAX)]; diff --git a/src/Access/IAccessStorage.h b/src/Access/IAccessStorage.h index ebb5a39cdf0..ad78bf92e02 100644 --- a/src/Access/IAccessStorage.h +++ b/src/Access/IAccessStorage.h @@ -228,7 +228,7 @@ protected: static UUID generateRandomID(); LoggerPtr getLogger() const; static String formatEntityTypeWithName(AccessEntityType type, const String & name) { return AccessEntityTypeInfo::get(type).formatEntityNameWithType(name); } - static void clearConflictsInEntitiesList(std::vector> & entities, const LoggerPtr log_); + static void clearConflictsInEntitiesList(std::vector> & entities, LoggerPtr log_); [[noreturn]] void throwNotFound(const UUID & id) const; [[noreturn]] void throwNotFound(AccessEntityType type, const String & name) const; [[noreturn]] static void throwBadCast(const UUID & id, AccessEntityType type, const String & name, AccessEntityType required_type); diff --git a/src/Common/Arena.h b/src/Common/Arena.h index cb26397844b..ba5b9ea9205 100644 --- a/src/Common/Arena.h +++ b/src/Common/Arena.h @@ -47,9 +47,7 @@ private: std::unique_ptr prev; - MemoryChunk() - { - } + MemoryChunk() = default; void swap(MemoryChunk & other) { diff --git a/src/Common/DNSResolver.cpp b/src/Common/DNSResolver.cpp index e36e1483da8..4b577a251af 100644 --- a/src/Common/DNSResolver.cpp +++ b/src/Common/DNSResolver.cpp @@ -297,7 +297,7 @@ void DNSResolver::setDisableCacheFlag(bool is_disabled) impl->disable_cache = is_disabled; } -void DNSResolver::setCacheMaxEntries(const UInt64 cache_max_entries) +void DNSResolver::setCacheMaxEntries(UInt64 cache_max_entries) { impl->cache_address.setMaxSizeInBytes(cache_max_entries); impl->cache_host.setMaxSizeInBytes(cache_max_entries); diff --git a/src/Common/DNSResolver.h 
b/src/Common/DNSResolver.h index e3030e51a96..1ddd9d3b991 100644 --- a/src/Common/DNSResolver.h +++ b/src/Common/DNSResolver.h @@ -56,7 +56,7 @@ public: void setDisableCacheFlag(bool is_disabled = true); /// Set a limit of entries in cache - void setCacheMaxEntries(const UInt64 cache_max_entries); + void setCacheMaxEntries(UInt64 cache_max_entries); /// Drops all caches void dropCache(); diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index 7bf66c0504a..4087e77d588 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -255,7 +255,7 @@ private: static LUTIndex toLUTIndex(ExtendedDayNum d) { - return normalizeLUTIndex(static_cast(d + daynum_offset_epoch)); + return normalizeLUTIndex(static_cast(d) + daynum_offset_epoch); } LUTIndex toLUTIndex(Time t) const diff --git a/src/Common/MultiVersion.h b/src/Common/MultiVersion.h index 8f488f9fcbc..680e224f869 100644 --- a/src/Common/MultiVersion.h +++ b/src/Common/MultiVersion.h @@ -41,9 +41,9 @@ public: } /// There is no copy constructor because only one MultiVersion should own the same object. - MultiVersion(MultiVersion && src) { *this = std::move(src); } + MultiVersion(MultiVersion && src) { *this = std::move(src); } /// NOLINT - MultiVersion & operator=(MultiVersion && src) + MultiVersion & operator=(MultiVersion && src) /// NOLINT { if (this != &src) { diff --git a/src/Common/PODArray.h b/src/Common/PODArray.h index 1a4047a2588..af863e01fb2 100644 --- a/src/Common/PODArray.h +++ b/src/Common/PODArray.h @@ -25,7 +25,7 @@ */ template constexpr bool memcpy_can_be_used_for_assignment = std::is_same_v - || (std::is_integral_v && std::is_integral_v && sizeof(T) == sizeof(U)); + || (std::is_integral_v && std::is_integral_v && sizeof(T) == sizeof(U)); /// NOLINT(misc-redundant-expression) namespace DB { @@ -558,7 +558,7 @@ public: } template - void swap(PODArray & rhs, TAllocatorParams &&... allocator_params) + void swap(PODArray & rhs, TAllocatorParams &&... allocator_params) /// NOLINT(performance-noexcept-swap) { #ifndef NDEBUG this->unprotect(); @@ -756,7 +756,7 @@ public: }; template -void swap(PODArray & lhs, PODArray & rhs) +void swap(PODArray & lhs, PODArray & rhs) /// NOLINT { lhs.swap(rhs); } diff --git a/src/Common/SipHash.h b/src/Common/SipHash.h index 729fb76a573..c89ee2c9d90 100644 --- a/src/Common/SipHash.h +++ b/src/Common/SipHash.h @@ -149,7 +149,7 @@ public: /// Pad the remainder, which is missing up to an 8-byte word. current_word = 0; - switch (end - data) + switch (end - data) /// NOLINT(bugprone-switch-missing-default-case) { case 7: current_bytes[CURRENT_BYTES_IDX(6)] = data[6]; [[fallthrough]]; case 6: current_bytes[CURRENT_BYTES_IDX(5)] = data[5]; [[fallthrough]]; diff --git a/src/Common/TransactionID.h b/src/Common/TransactionID.h index 3ab86f7589c..97d0072bc14 100644 --- a/src/Common/TransactionID.h +++ b/src/Common/TransactionID.h @@ -16,7 +16,7 @@ class MergeTreeTransaction; /// or transaction object is not needed and not passed intentionally. 
#ifndef NO_TRANSACTION_PTR #define NO_TRANSACTION_PTR std::shared_ptr(nullptr) -#define NO_TRANSACTION_RAW static_cast(nullptr) +#define NO_TRANSACTION_RAW static_cast(nullptr) /// NOLINT(bugprone-macro-parentheses) #endif /// Commit Sequence Number diff --git a/src/Common/ZooKeeper/IKeeper.cpp b/src/Common/ZooKeeper/IKeeper.cpp index 6c47ea68b84..7d2602bde1e 100644 --- a/src/Common/ZooKeeper/IKeeper.cpp +++ b/src/Common/ZooKeeper/IKeeper.cpp @@ -23,7 +23,7 @@ namespace ProfileEvents namespace Coordination { -void Exception::incrementErrorMetrics(const Error code_) +void Exception::incrementErrorMetrics(Error code_) { if (Coordination::isUserError(code_)) ProfileEvents::increment(ProfileEvents::ZooKeeperUserExceptions); @@ -33,14 +33,14 @@ void Exception::incrementErrorMetrics(const Error code_) ProfileEvents::increment(ProfileEvents::ZooKeeperOtherExceptions); } -Exception::Exception(const std::string & msg, const Error code_, int) +Exception::Exception(const std::string & msg, Error code_, int) : DB::Exception(msg, DB::ErrorCodes::KEEPER_EXCEPTION) , code(code_) { incrementErrorMetrics(code); } -Exception::Exception(PreformattedMessage && msg, const Error code_) +Exception::Exception(PreformattedMessage && msg, Error code_) : DB::Exception(std::move(msg), DB::ErrorCodes::KEEPER_EXCEPTION) , code(code_) { @@ -48,7 +48,7 @@ Exception::Exception(PreformattedMessage && msg, const Error code_) incrementErrorMetrics(code); } -Exception::Exception(const Error code_) +Exception::Exception(Error code_) : Exception(code_, "Coordination error: {}", errorMessage(code_)) { } diff --git a/src/Common/ZooKeeper/IKeeper.h b/src/Common/ZooKeeper/IKeeper.h index c7b902ea03a..ec49c94808e 100644 --- a/src/Common/ZooKeeper/IKeeper.h +++ b/src/Common/ZooKeeper/IKeeper.h @@ -466,13 +466,13 @@ class Exception : public DB::Exception { private: /// Delegate constructor, used to minimize repetition; last parameter used for overload resolution. - Exception(const std::string & msg, const Error code_, int); /// NOLINT - Exception(PreformattedMessage && msg, const Error code_); + Exception(const std::string & msg, Error code_, int); /// NOLINT + Exception(PreformattedMessage && msg, Error code_); /// Message must be a compile-time constant template requires std::is_convertible_v - Exception(T && message, const Error code_) : DB::Exception(std::forward(message), DB::ErrorCodes::KEEPER_EXCEPTION, /* remote_= */ false), code(code_) + Exception(T && message, Error code_) : DB::Exception(std::forward(message), DB::ErrorCodes::KEEPER_EXCEPTION, /* remote_= */ false), code(code_) { incrementErrorMetrics(code); } @@ -480,23 +480,23 @@ private: static void incrementErrorMetrics(Error code_); public: - explicit Exception(const Error code_); /// NOLINT + explicit Exception(Error code_); /// NOLINT Exception(const Exception & exc); template - Exception(const Error code_, FormatStringHelper fmt, Args &&... args) + Exception(Error code_, FormatStringHelper fmt, Args &&... args) : DB::Exception(DB::ErrorCodes::KEEPER_EXCEPTION, std::move(fmt), std::forward(args)...) 
, code(code_) { incrementErrorMetrics(code); } - inline static Exception createDeprecated(const std::string & msg, const Error code_) + inline static Exception createDeprecated(const std::string & msg, Error code_) { return Exception(msg, code_, 0); } - inline static Exception fromPath(const Error code_, const std::string & path) + inline static Exception fromPath(Error code_, const std::string & path) { return Exception(code_, "Coordination error: {}, path {}", errorMessage(code_), path); } @@ -504,7 +504,7 @@ public: /// Message must be a compile-time constant template requires std::is_convertible_v - inline static Exception fromMessage(const Error code_, T && message) + inline static Exception fromMessage(Error code_, T && message) { return Exception(std::forward(message), code_); } diff --git a/src/Common/logger_useful.h b/src/Common/logger_useful.h index 8e78e93e198..013b35e695e 100644 --- a/src/Common/logger_useful.h +++ b/src/Common/logger_useful.h @@ -19,14 +19,14 @@ namespace Poco { class Logger; } using LogSeriesLimiterPtr = std::shared_ptr; -namespace +namespace impl { - [[maybe_unused]] LoggerPtr getLoggerHelper(const LoggerPtr & logger) { return logger; } - [[maybe_unused]] LoggerPtr getLoggerHelper(const AtomicLogger & logger) { return logger.load(); } - [[maybe_unused]] const ::Poco::Logger * getLoggerHelper(const ::Poco::Logger * logger) { return logger; } - [[maybe_unused]] std::unique_ptr getLoggerHelper(std::unique_ptr && logger) { return logger; } - [[maybe_unused]] std::unique_ptr getLoggerHelper(std::unique_ptr && logger) { return logger; } - [[maybe_unused]] LogSeriesLimiterPtr getLoggerHelper(LogSeriesLimiterPtr & logger) { return logger; } + [[maybe_unused]] inline LoggerPtr getLoggerHelper(const LoggerPtr & logger) { return logger; } + [[maybe_unused]] inline LoggerPtr getLoggerHelper(const AtomicLogger & logger) { return logger.load(); } + [[maybe_unused]] inline const ::Poco::Logger * getLoggerHelper(const ::Poco::Logger * logger) { return logger; } + [[maybe_unused]] inline std::unique_ptr getLoggerHelper(std::unique_ptr && logger) { return logger; } + [[maybe_unused]] inline std::unique_ptr getLoggerHelper(std::unique_ptr && logger) { return logger; } + [[maybe_unused]] inline LogSeriesLimiterPtr getLoggerHelper(LogSeriesLimiterPtr & logger) { return logger; } } #define LOG_IMPL_FIRST_ARG(X, ...) X @@ -65,7 +65,7 @@ namespace #define LOG_IMPL(logger, priority, PRIORITY, ...) 
do \ { \ - auto _logger = ::getLoggerHelper(logger); \ + auto _logger = ::impl::getLoggerHelper(logger); \ const bool _is_clients_log = (DB::CurrentThread::getGroup() != nullptr) && \ (DB::CurrentThread::get().getClientLogsLevel() >= (priority)); \ if (!_is_clients_log && !_logger->is((PRIORITY))) \ diff --git a/src/Core/PostgreSQL/insertPostgreSQLValue.cpp b/src/Core/PostgreSQL/insertPostgreSQLValue.cpp index aa60bdee28a..b507b300769 100644 --- a/src/Core/PostgreSQL/insertPostgreSQLValue.cpp +++ b/src/Core/PostgreSQL/insertPostgreSQLValue.cpp @@ -36,7 +36,7 @@ void insertDefaultPostgreSQLValue(IColumn & column, const IColumn & sample_colum void insertPostgreSQLValue( IColumn & column, std::string_view value, - const ExternalResultDescription::ValueType type, const DataTypePtr data_type, + ExternalResultDescription::ValueType type, DataTypePtr data_type, const std::unordered_map & array_info, size_t idx) { switch (type) @@ -170,7 +170,7 @@ void insertPostgreSQLValue( void preparePostgreSQLArrayInfo( - std::unordered_map & array_info, size_t column_idx, const DataTypePtr data_type) + std::unordered_map & array_info, size_t column_idx, DataTypePtr data_type) { const auto * array_type = typeid_cast(data_type.get()); auto nested = array_type->getNestedType(); diff --git a/src/Core/PostgreSQL/insertPostgreSQLValue.h b/src/Core/PostgreSQL/insertPostgreSQLValue.h index 3bc83292b96..bfb85422aa1 100644 --- a/src/Core/PostgreSQL/insertPostgreSQLValue.h +++ b/src/Core/PostgreSQL/insertPostgreSQLValue.h @@ -22,11 +22,11 @@ struct PostgreSQLArrayInfo void insertPostgreSQLValue( IColumn & column, std::string_view value, - const ExternalResultDescription::ValueType type, const DataTypePtr data_type, + ExternalResultDescription::ValueType type, DataTypePtr data_type, const std::unordered_map & array_info, size_t idx); void preparePostgreSQLArrayInfo( - std::unordered_map & array_info, size_t column_idx, const DataTypePtr data_type); + std::unordered_map & array_info, size_t column_idx, DataTypePtr data_type); void insertDefaultPostgreSQLValue(IColumn & column, const IColumn & sample_column); diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d70a6cf51c5..c41db9d2141 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1192,6 +1192,7 @@ class IColumn; FORMAT_FACTORY_SETTINGS(M, ALIAS) \ OBSOLETE_FORMAT_SETTINGS(M, ALIAS) \ +/// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) DECLARE_SETTINGS_TRAITS_ALLOW_CUSTOM_SETTINGS(SettingsTraits, LIST_OF_SETTINGS) @@ -1236,6 +1237,7 @@ private: /* * User-specified file format settings for File and URL engines. 
*/ +/// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) DECLARE_SETTINGS_TRAITS(FormatFactorySettingsTraits, LIST_OF_ALL_FORMAT_SETTINGS) struct FormatFactorySettings : public BaseSettings diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 8444042db9e..6e9b09f8919 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -423,7 +423,7 @@ MutableColumns CacheDictionary::aggregateColumnsInOrderOfKe const DictionaryStorageFetchRequest & request, const MutableColumns & fetched_columns, const PaddedPODArray & key_index_to_state, - IColumn::Filter * const default_mask) const + IColumn::Filter * default_mask) const { MutableColumns aggregated_columns = request.makeAttributesResultColumns(); @@ -473,7 +473,7 @@ MutableColumns CacheDictionary::aggregateColumns( const PaddedPODArray & key_index_to_fetched_columns_from_storage_result, const MutableColumns & fetched_columns_during_update, const HashMap & found_keys_to_fetched_columns_during_update_index, - IColumn::Filter * const default_mask) const + IColumn::Filter * default_mask) const { /** * Aggregation of columns fetched from storage and from source during update. diff --git a/src/Dictionaries/CacheDictionary.h b/src/Dictionaries/CacheDictionary.h index 8897fb40fa9..c02fb91c60e 100644 --- a/src/Dictionaries/CacheDictionary.h +++ b/src/Dictionaries/CacheDictionary.h @@ -162,7 +162,7 @@ private: const DictionaryStorageFetchRequest & request, const MutableColumns & fetched_columns, const PaddedPODArray & key_index_to_state, - IColumn::Filter * const default_mask = nullptr) const; + IColumn::Filter * default_mask = nullptr) const; MutableColumns aggregateColumns( const PaddedPODArray & keys, diff --git a/src/Dictionaries/Embedded/GeodataProviders/IHierarchiesProvider.h b/src/Dictionaries/Embedded/GeodataProviders/IHierarchiesProvider.h index 68ab0fdca2d..a4b88127786 100644 --- a/src/Dictionaries/Embedded/GeodataProviders/IHierarchiesProvider.h +++ b/src/Dictionaries/Embedded/GeodataProviders/IHierarchiesProvider.h @@ -14,7 +14,7 @@ class IRegionsHierarchyReader public: virtual bool readNext(RegionEntry & entry) = 0; - virtual ~IRegionsHierarchyReader() {} + virtual ~IRegionsHierarchyReader() = default; }; using IRegionsHierarchyReaderPtr = std::unique_ptr; diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index 4d82aa9ca0e..1f5c2d6d2c7 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -568,7 +568,7 @@ bool RegExpTreeDictionary::setAttributesShortCircuit( const String & data, std::unordered_set & visited_nodes, const std::unordered_map & attributes, - std::unordered_set * const defaults) const + std::unordered_set * defaults) const { if (visited_nodes.contains(id)) return attributes_to_set.attributesFull() == attributes.size(); diff --git a/src/Dictionaries/RegExpTreeDictionary.h b/src/Dictionaries/RegExpTreeDictionary.h index 9e14abb49d0..d6bc90ef651 100644 --- a/src/Dictionaries/RegExpTreeDictionary.h +++ b/src/Dictionaries/RegExpTreeDictionary.h @@ -210,7 +210,7 @@ private: const String & data, std::unordered_set & visited_nodes, const std::unordered_map & attributes, - std::unordered_set * const defaults) const; + std::unordered_set * defaults) const; struct RegexTreeNode; using RegexTreeNodePtr = std::shared_ptr; diff --git a/src/Functions/IFunction.h b/src/Functions/IFunction.h index 05aa08e2ad7..9b7cdf12d57 100644 --- a/src/Functions/IFunction.h +++ 
b/src/Functions/IFunction.h @@ -13,10 +13,6 @@ #include -#if USE_EMBEDDED_COMPILER -# include -#endif - /// This file contains user interface for functions. namespace llvm diff --git a/src/IO/ReadSettings.h b/src/IO/ReadSettings.h index 38904df4403..5c401c0c8d9 100644 --- a/src/IO/ReadSettings.h +++ b/src/IO/ReadSettings.h @@ -63,6 +63,7 @@ enum class RemoteFSReadMethod class MMappedFileCache; class PageCache; +/// NOLINTNEXTLINE(clang-analyzer-optin.performance.Padding) struct ReadSettings { /// Method to use reading from local filesystem. diff --git a/src/Interpreters/AsynchronousInsertQueue.cpp b/src/Interpreters/AsynchronousInsertQueue.cpp index 7d56dbabe3c..9327f31b6ff 100644 --- a/src/Interpreters/AsynchronousInsertQueue.cpp +++ b/src/Interpreters/AsynchronousInsertQueue.cpp @@ -905,7 +905,7 @@ Chunk AsynchronousInsertQueue::processEntriesWithParsing( const InsertDataPtr & data, const Block & header, const ContextPtr & insert_context, - const LoggerPtr logger, + LoggerPtr logger, LogFunc && add_to_async_insert_log) { size_t total_rows = 0; diff --git a/src/Interpreters/AsynchronousInsertQueue.h b/src/Interpreters/AsynchronousInsertQueue.h index f60b3d343fb..5076701d0b0 100644 --- a/src/Interpreters/AsynchronousInsertQueue.h +++ b/src/Interpreters/AsynchronousInsertQueue.h @@ -265,7 +265,7 @@ private: const InsertDataPtr & data, const Block & header, const ContextPtr & insert_context, - const LoggerPtr logger, + LoggerPtr logger, LogFunc && add_to_async_insert_log); template diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index c8aa3604a6f..43df8d6adf2 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -330,7 +330,7 @@ protected: return *this; } - void swap(QueryAccessInfo & rhs) + void swap(QueryAccessInfo & rhs) noexcept { std::swap(databases, rhs.databases); std::swap(tables, rhs.tables); @@ -680,7 +680,7 @@ public: void addSpecialScalar(const String & name, const Block & block); const QueryAccessInfo & getQueryAccessInfo() const { return *getQueryAccessInfoPtr(); } - const QueryAccessInfoPtr getQueryAccessInfoPtr() const { return query_access_info; } + QueryAccessInfoPtr getQueryAccessInfoPtr() const { return query_access_info; } void setQueryAccessInfo(QueryAccessInfoPtr other) { query_access_info = other; } void addQueryAccessInfo( diff --git a/src/Interpreters/IExternalLoadable.h b/src/Interpreters/IExternalLoadable.h index 3c004508b0a..47031778876 100644 --- a/src/Interpreters/IExternalLoadable.h +++ b/src/Interpreters/IExternalLoadable.h @@ -23,7 +23,7 @@ struct ExternalLoadableLifetime UInt64 max_sec = 0; ExternalLoadableLifetime(const Poco::Util::AbstractConfiguration & config, const std::string & config_prefix); - ExternalLoadableLifetime() {} + ExternalLoadableLifetime() = default; }; /// Get delay before trying to load again after error. 
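The changes in this patch are mechanical clang-tidy fixes that recur across the hunks above and below: top-level `const` is dropped from by-value parameters because it is not part of the function type, empty special members become `= default`, and `swap` gains `noexcept`. A minimal self-contained sketch of why each form is preferred (illustrative only; the names are invented and nothing here is code from the patch):

```cpp
#include <cstddef>
#include <utility>

// Top-level const on a by-value parameter does not change the function
// type: both lines declare the very same function, so the const is noise
// in a declaration (readability-avoid-const-params-in-decls).
void setLevel(const int level);
void setLevel(int level); // redeclaration, not an overload

struct Sample
{
    // '= default' instead of an empty body '{}' documents intent and keeps
    // the member trivial where possible (modernize-use-equals-default).
    Sample() = default;

    // swap is expected never to throw; spelling that out lets callers and
    // the standard library rely on it (performance-noexcept-swap).
    void swap(Sample & other) noexcept { std::swap(size, other.size); }

    std::size_t size = 0;
};
```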
diff --git a/src/Interpreters/ProcessList.h b/src/Interpreters/ProcessList.h index 1c253f562e8..ad47041c762 100644 --- a/src/Interpreters/ProcessList.h +++ b/src/Interpreters/ProcessList.h @@ -318,7 +318,7 @@ public: ~ProcessListEntry(); QueryStatusPtr getQueryStatus() { return *it; } - const QueryStatusPtr getQueryStatus() const { return *it; } + QueryStatusPtr getQueryStatus() const { return *it; } }; diff --git a/src/Processors/Chunk.h b/src/Processors/Chunk.h index 9a7d6bc294d..4f753798eaa 100644 --- a/src/Processors/Chunk.h +++ b/src/Processors/Chunk.h @@ -59,7 +59,7 @@ public: Chunk clone() const; - void swap(Chunk & other) + void swap(Chunk & other) noexcept { columns.swap(other.columns); chunk_info.swap(other.chunk_info); diff --git a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp index d2d2434c477..3bd0b532d90 100644 --- a/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp +++ b/src/Processors/Merges/Algorithms/AggregatingSortedAlgorithm.cpp @@ -126,7 +126,7 @@ static void postprocessChunk(Chunk & chunk, const AggregatingSortedAlgorithm::Co AggregatingSortedAlgorithm::SimpleAggregateDescription::SimpleAggregateDescription( - AggregateFunctionPtr function_, const size_t column_number_, + AggregateFunctionPtr function_, size_t column_number_, DataTypePtr nested_type_, DataTypePtr real_type_) : function(std::move(function_)), column_number(column_number_) , nested_type(std::move(nested_type_)), real_type(std::move(real_type_)) diff --git a/src/Processors/Port.h b/src/Processors/Port.h index 67af2f041aa..f3c7bbb5fee 100644 --- a/src/Processors/Port.h +++ b/src/Processors/Port.h @@ -110,7 +110,7 @@ protected: return result; } - uintptr_t ALWAYS_INLINE swap(std::atomic & value, std::uintptr_t flags, std::uintptr_t mask) + uintptr_t ALWAYS_INLINE swap(std::atomic & value, std::uintptr_t flags, std::uintptr_t mask) /// NOLINT { Data * expected = nullptr; Data * desired = getPtr(flags | getUInt(data)); diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp index b7cddf3c165..13d3030bbb8 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp @@ -6,8 +6,8 @@ namespace DB TTLUpdateInfoAlgorithm::TTLUpdateInfoAlgorithm( const TTLExpressions & ttl_expressions_, const TTLDescription & description_, - const TTLUpdateField ttl_update_field_, - const String ttl_update_key_, + TTLUpdateField ttl_update_field_, + String ttl_update_key_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h index 0cf31765aef..b6aee6f7cb0 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h @@ -22,8 +22,8 @@ public: TTLUpdateInfoAlgorithm( const TTLExpressions & ttl_expressions_, const TTLDescription & description_, - const TTLUpdateField ttl_update_field_, - const String ttl_update_key_, + TTLUpdateField ttl_update_field_, + String ttl_update_key_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_ ); diff --git a/src/Storages/StorageInMemoryMetadata.h b/src/Storages/StorageInMemoryMetadata.h index 2823aba1224..69cd3422a7d 100644 --- a/src/Storages/StorageInMemoryMetadata.h +++ b/src/Storages/StorageInMemoryMetadata.h @@ -72,8 +72,8 @@ struct StorageInMemoryMetadata StorageInMemoryMetadata(const StorageInMemoryMetadata & other); 
StorageInMemoryMetadata & operator=(const StorageInMemoryMetadata & other);
- StorageInMemoryMetadata(StorageInMemoryMetadata && other) = default;
- StorageInMemoryMetadata & operator=(StorageInMemoryMetadata && other) = default;
+ StorageInMemoryMetadata(StorageInMemoryMetadata && other) = default; /// NOLINT
+ StorageInMemoryMetadata & operator=(StorageInMemoryMetadata && other) = default; /// NOLINT

/// NOTE: Thread unsafe part. You should not modify same StorageInMemoryMetadata
/// structure from different threads. It should be used as MultiVersion

From e562d97ff59739da3ca3650a33644bf9700becd7 Mon Sep 17 00:00:00 2001
From: Nikolay Yankin <211292+kolya7k@users.noreply.github.com>
Date: Mon, 11 Mar 2024 14:07:19 +0300
Subject: [PATCH 0210/1081] Update install.md

https://packages.clickhouse.com/tgz/stable/ is paginated now and sorted by
date in ascending order, so we can no longer get the very latest version
from it.
---
 docs/en/getting-started/install.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md
index 234420de374..3b01434ecc5 100644
--- a/docs/en/getting-started/install.md
+++ b/docs/en/getting-started/install.md
@@ -262,7 +262,7 @@ The required version can be downloaded with `curl` or `wget` from repository htt
After that downloaded archives should be unpacked and installed with installation scripts. Example for the latest stable version:

``` bash
-LATEST_VERSION=$(curl -s https://packages.clickhouse.com/tgz/stable/ | \
+LATEST_VERSION=$(curl -s -L https://api.github.com/repos/ClickHouse/ClickHouse/tags | \
 grep -Eo '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | sort -V -r | head -n 1)
 export LATEST_VERSION

From 90b27432a26c0a5204e09ff5ff5f2ae8df3055af Mon Sep 17 00:00:00 2001
From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com>
Date: Mon, 11 Mar 2024 12:18:58 +0100
Subject: [PATCH 0211/1081] Update test.py

---
 tests/integration/test_backup_restore_s3/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py
index 452a9143067..f3f4837c317 100644
--- a/tests/integration/test_backup_restore_s3/test.py
+++ b/tests/integration/test_backup_restore_s3/test.py
@@ -130,7 +130,7 @@ def check_system_tables(backup_query_id=None):
 if disk
 ]
 expected_disks = (
- ("default", "local", "", ""),
+ ("default", "local", "None", "None"),
 ("disk_s3", "object_storage", "s3", "local"),
 ("disk_s3_cache", "object_storage", "s3", "local"),
 ("disk_s3_other_bucket", "object_storage", "s3", "local"),

From 57f6263f67dd91e624003199295c840a228947a0 Mon Sep 17 00:00:00 2001
From: kssenii
Date: Mon, 11 Mar 2024 12:31:40 +0100
Subject: [PATCH 0212/1081] Lock contention fix

---
 src/Common/ProfileEvents.cpp | 1 +
 src/Interpreters/Cache/FileCache.cpp | 12 +++++++++---
 src/Interpreters/Cache/FileCache.h | 2 +-
 src/Interpreters/Cache/FileCache_fwd.h | 1 +
 src/Interpreters/Cache/Guards.h | 15 ++++++++++++---
 5 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp
index c1ac3d08245..ab1a16a3edf 100644
--- a/src/Common/ProfileEvents.cpp
+++ b/src/Common/ProfileEvents.cpp
@@ -476,6 +476,7 @@ The server successfully detected this situation and will download merged part fr
 M(FileSegmentRemoveMicroseconds, "File segment remove() time") \
 M(FileSegmentHolderCompleteMicroseconds, "File segments holder complete() time") \
 M(FileSegmentFailToIncreasePriority, "Number of times the priority was
not increased due to a high contention on the cache lock") \ + M(FilesystemCacheFailToReserveSpaceBecauseOfLockContention, "Number of times space reservation was skipped due to a high contention on the cache lock") \ M(FilesystemCacheHoldFileSegments, "Filesystem cache file segments count, which were hold") \ M(FilesystemCacheUnusedHoldFileSegments, "Filesystem cache file segments count, which were hold, but not used (because of seek or LIMIT n, etc)") \ \ diff --git a/src/Interpreters/Cache/FileCache.cpp b/src/Interpreters/Cache/FileCache.cpp index 9c705ddc27c..5650b9ce44e 100644 --- a/src/Interpreters/Cache/FileCache.cpp +++ b/src/Interpreters/Cache/FileCache.cpp @@ -27,6 +27,7 @@ namespace ProfileEvents extern const Event FilesystemCacheReserveMicroseconds; extern const Event FilesystemCacheGetOrSetMicroseconds; extern const Event FilesystemCacheGetMicroseconds; + extern const Event FilesystemCacheFailToReserveSpaceBecauseOfLockContention; } namespace DB @@ -188,9 +189,9 @@ CacheGuard::Lock FileCache::lockCache() const return cache_guard.lock(); } -CacheGuard::Lock FileCache::tryLockCache() const +CacheGuard::Lock FileCache::tryLockCache(std::optional acquire_timeout) const { - return cache_guard.tryLock(); + return acquire_timeout.has_value() ? cache_guard.tryLockFor(acquire_timeout.value()) : cache_guard.tryLock(); } FileSegments FileCache::getImpl(const LockedKey & locked_key, const FileSegment::Range & range, size_t file_segments_limit) const @@ -781,7 +782,12 @@ bool FileCache::tryReserve( ProfileEventTimeIncrement watch(ProfileEvents::FilesystemCacheReserveMicroseconds); assertInitialized(); - auto cache_lock = lockCache(); + auto cache_lock = tryLockCache(std::chrono::milliseconds(FILECACHE_TRY_RESERVE_LOCK_TIMEOUT_MILLISECONDS)); + if (!cache_lock) + { + ProfileEvents::increment(ProfileEvents::FilesystemCacheFailToReserveSpaceBecauseOfLockContention); + return false; + } LOG_TEST( log, "Trying to reserve space ({} bytes) for {}:{}, current usage {}/{}", diff --git a/src/Interpreters/Cache/FileCache.h b/src/Interpreters/Cache/FileCache.h index 5b665ad0271..7434b2ac78a 100644 --- a/src/Interpreters/Cache/FileCache.h +++ b/src/Interpreters/Cache/FileCache.h @@ -173,7 +173,7 @@ public: void deactivateBackgroundOperations(); CacheGuard::Lock lockCache() const; - CacheGuard::Lock tryLockCache() const; + CacheGuard::Lock tryLockCache(std::optional acquire_timeout = std::nullopt) const; std::vector sync(); diff --git a/src/Interpreters/Cache/FileCache_fwd.h b/src/Interpreters/Cache/FileCache_fwd.h index 06261b19db7..eaed279e7fd 100644 --- a/src/Interpreters/Cache/FileCache_fwd.h +++ b/src/Interpreters/Cache/FileCache_fwd.h @@ -12,6 +12,7 @@ static constexpr int FILECACHE_DEFAULT_LOAD_METADATA_THREADS = 16; static constexpr int FILECACHE_DEFAULT_MAX_ELEMENTS = 10000000; static constexpr int FILECACHE_DEFAULT_HITS_THRESHOLD = 0; static constexpr size_t FILECACHE_BYPASS_THRESHOLD = 256 * 1024 * 1024; +static constexpr size_t FILECACHE_TRY_RESERVE_LOCK_TIMEOUT_MILLISECONDS = 1000; /// 1 sec. 
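The `tryLockCache(acquire_timeout)` call above (together with `tryLockFor` in the Guards.h hunk below) relies on the timed constructor of `std::unique_lock`, which calls `try_lock_for` on the underlying `std::timed_mutex` and leaves the lock unowned when the timeout expires. A minimal sketch of the reservation behaviour this enables — the function name is invented, and the 1000 ms mirrors `FILECACHE_TRY_RESERVE_LOCK_TIMEOUT_MILLISECONDS`:

```cpp
#include <chrono>
#include <mutex>

std::timed_mutex cache_mutex;

bool tryReserveSketch()
{
    // Wait for the cache lock, but only up to a bounded timeout.
    std::unique_lock<std::timed_mutex> lock(cache_mutex, std::chrono::milliseconds(1000));

    // On timeout the lock is simply not owned: report the contention (the
    // patch bumps FilesystemCacheFailToReserveSpaceBecauseOfLockContention)
    // and let the caller fall back instead of queueing behind every other
    // reservation.
    if (!lock.owns_lock())
        return false;

    // ... perform the actual space reservation under the lock ...
    return true;
}
```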
class FileCache; using FileCachePtr = std::shared_ptr; diff --git a/src/Interpreters/Cache/Guards.h b/src/Interpreters/Cache/Guards.h index 5729620d82f..0ac7cb80483 100644 --- a/src/Interpreters/Cache/Guards.h +++ b/src/Interpreters/Cache/Guards.h @@ -61,17 +61,26 @@ namespace DB */ struct CacheGuard : private boost::noncopyable { + using Mutex = std::timed_mutex; /// struct is used (not keyword `using`) to make CacheGuard::Lock non-interchangable with other guards locks /// so, we wouldn't be able to pass CacheGuard::Lock to a function which accepts KeyGuard::Lock, for example - struct Lock : public std::unique_lock + struct Lock : public std::unique_lock { - using Base = std::unique_lock; + using Base = std::unique_lock; using Base::Base; }; Lock lock() { return Lock(mutex); } + Lock tryLock() { return Lock(mutex, std::try_to_lock); } - std::mutex mutex; + + Lock tryLockFor(const std::chrono::milliseconds & acquire_timeout) + { + return Lock(mutex, std::chrono::duration(acquire_timeout)); + } + +private: + Mutex mutex; }; /** From a7350299396d5ba3f2322584195554a7d946562f Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 11 Mar 2024 12:50:54 +0000 Subject: [PATCH 0213/1081] Fix tests --- src/Common/DateLUTImpl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/DateLUTImpl.h b/src/Common/DateLUTImpl.h index 4087e77d588..082127e717c 100644 --- a/src/Common/DateLUTImpl.h +++ b/src/Common/DateLUTImpl.h @@ -255,7 +255,7 @@ private: static LUTIndex toLUTIndex(ExtendedDayNum d) { - return normalizeLUTIndex(static_cast(d) + daynum_offset_epoch); + return normalizeLUTIndex(static_cast(d + daynum_offset_epoch)); /// NOLINT } LUTIndex toLUTIndex(Time t) const From 802bae9661a6f22a8c42a6f88f9816e3497d9355 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 11 Mar 2024 12:54:34 +0000 Subject: [PATCH 0214/1081] GCC --> clang pragmas --- base/base/coverage.cpp | 2 +- base/base/sort.h | 6 +++--- programs/client/Client.cpp | 4 ++-- programs/copier/ClusterCopierApp.cpp | 4 ++-- programs/extract-from-config/ExtractFromConfig.cpp | 4 ++-- programs/format/Format.cpp | 4 ++-- programs/local/LocalServer.cpp | 4 ++-- programs/obfuscator/Obfuscator.cpp | 4 ++-- src/Common/SSH/Wrappers.cpp | 8 ++++---- src/Functions/GatherUtils/Sources.h | 8 ++++---- 10 files changed, 24 insertions(+), 24 deletions(-) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index 99b897c4571..d96b3ea1e9a 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -1,7 +1,7 @@ #include "coverage.h" #include -#pragma GCC diagnostic ignored "-Wreserved-identifier" +#pragma clang diagnostic ignored "-Wreserved-identifier" /// WITH_COVERAGE enables the default implementation of code coverage, diff --git a/base/base/sort.h b/base/base/sort.h index 99bf8a0830e..e46c388d185 100644 --- a/base/base/sort.h +++ b/base/base/sort.h @@ -59,8 +59,8 @@ using ComparatorWrapper = Comparator; #endif -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wold-style-cast" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wold-style-cast" #include @@ -115,7 +115,7 @@ void partial_sort(RandomIt first, RandomIt middle, RandomIt last) ::partial_sort(first, middle, last, comparator()); } -#pragma GCC diagnostic pop +#pragma clang diagnostic pop template void sort(RandomIt first, RandomIt last, Compare compare) diff --git a/programs/client/Client.cpp b/programs/client/Client.cpp index a2bd6b6016a..1c7e57dac76 100644 --- a/programs/client/Client.cpp +++ b/programs/client/Client.cpp 
@@ -1377,8 +1377,8 @@ void Client::readArguments(
}

-#pragma GCC diagnostic ignored "-Wunused-function"
-#pragma GCC diagnostic ignored "-Wmissing-declarations"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wmissing-declarations"

int mainEntryClickHouseClient(int argc, char ** argv)
{
diff --git a/programs/copier/ClusterCopierApp.cpp b/programs/copier/ClusterCopierApp.cpp
index fdf07dec61a..ed748a17a55 100644
--- a/programs/copier/ClusterCopierApp.cpp
+++ b/programs/copier/ClusterCopierApp.cpp
@@ -232,8 +232,8 @@ int ClusterCopierApp::main(const std::vector &)
}

-#pragma GCC diagnostic ignored "-Wunused-function"
-#pragma GCC diagnostic ignored "-Wmissing-declarations"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wmissing-declarations"

int mainEntryClickHouseClusterCopier(int argc, char ** argv)
{
diff --git a/programs/extract-from-config/ExtractFromConfig.cpp b/programs/extract-from-config/ExtractFromConfig.cpp
index 56041ee382f..61d451664e3 100644
--- a/programs/extract-from-config/ExtractFromConfig.cpp
+++ b/programs/extract-from-config/ExtractFromConfig.cpp
@@ -109,8 +109,8 @@ static std::vector extractFromConfig(
 return {configuration->getString(key)};
}

-#pragma GCC diagnostic ignored "-Wunused-function"
-#pragma GCC diagnostic ignored "-Wmissing-declarations"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wmissing-declarations"

int mainEntryClickHouseExtractFromConfig(int argc, char ** argv)
{
diff --git a/programs/format/Format.cpp b/programs/format/Format.cpp
index a1c51565ae3..c92106e2f90 100644
--- a/programs/format/Format.cpp
+++ b/programs/format/Format.cpp
@@ -70,8 +70,8 @@ void skipSpacesAndComments(const char*& pos, const char* end, bool print_comment
}

-#pragma GCC diagnostic ignored "-Wunused-function"
-#pragma GCC diagnostic ignored "-Wmissing-declarations"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wmissing-declarations"

extern const char * auto_time_zones[];

diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp
index 68f0e52ce08..99639d5e604 100644
--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@@ -944,8 +944,8 @@ void LocalServer::readArguments(int argc, char ** argv, Arguments & common_argum
}

-#pragma GCC diagnostic ignored "-Wunused-function"
-#pragma GCC diagnostic ignored "-Wmissing-declarations"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wmissing-declarations"

int mainEntryClickHouseLocal(int argc, char ** argv)
{
diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp
index 242e995e466..317d93aaf0c 100644
--- a/programs/obfuscator/Obfuscator.cpp
+++ b/programs/obfuscator/Obfuscator.cpp
@@ -1204,8 +1204,8 @@ public:
}

-#pragma GCC diagnostic ignored "-Wunused-function"
-#pragma GCC diagnostic ignored "-Wmissing-declarations"
+#pragma clang diagnostic ignored "-Wunused-function"
+#pragma clang diagnostic ignored "-Wmissing-declarations"

int mainEntryClickHouseObfuscator(int argc, char ** argv)
try
diff --git a/src/Common/SSH/Wrappers.cpp b/src/Common/SSH/Wrappers.cpp
index 463338dbe3f..a9b9f758c6e 100644
--- a/src/Common/SSH/Wrappers.cpp
+++ b/src/Common/SSH/Wrappers.cpp
@@ -2,13 +2,13 @@
# if USE_SSH
# include

-# pragma GCC diagnostic push
-# pragma GCC diagnostic ignored "-Wreserved-macro-identifier"
-# pragma GCC diagnostic ignored "-Wreserved-identifier"
+# 
pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wreserved-macro-identifier" +# pragma clang diagnostic ignored "-Wreserved-identifier" # include -# pragma GCC diagnostic pop +# pragma clang diagnostic pop namespace { diff --git a/src/Functions/GatherUtils/Sources.h b/src/Functions/GatherUtils/Sources.h index 222f9f19168..80fb9ce3900 100644 --- a/src/Functions/GatherUtils/Sources.h +++ b/src/Functions/GatherUtils/Sources.h @@ -140,9 +140,9 @@ struct NumericArraySource : public ArraySourceImpl> /// The methods can be virtual or not depending on the template parameter. See IStringSource. -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wsuggest-override" -#pragma GCC diagnostic ignored "-Wsuggest-destructor-override" +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wsuggest-override" +#pragma clang diagnostic ignored "-Wsuggest-destructor-override" template struct ConstSource : public Base @@ -231,7 +231,7 @@ struct ConstSource : public Base } }; -#pragma GCC diagnostic pop +#pragma clang diagnostic pop struct StringSource { From 19d8256fa83a4e8353dcad372067085ec8f0828d Mon Sep 17 00:00:00 2001 From: Kseniia Sumarokova <54203879+kssenii@users.noreply.github.com> Date: Mon, 11 Mar 2024 14:44:19 +0100 Subject: [PATCH 0215/1081] Update test.py --- tests/integration/test_backup_restore_s3/test.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/integration/test_backup_restore_s3/test.py b/tests/integration/test_backup_restore_s3/test.py index f3f4837c317..d65fc1f09d6 100644 --- a/tests/integration/test_backup_restore_s3/test.py +++ b/tests/integration/test_backup_restore_s3/test.py @@ -130,11 +130,11 @@ def check_system_tables(backup_query_id=None): if disk ] expected_disks = ( - ("default", "local", "None", "None"), - ("disk_s3", "object_storage", "s3", "local"), - ("disk_s3_cache", "object_storage", "s3", "local"), - ("disk_s3_other_bucket", "object_storage", "s3", "local"), - ("disk_s3_plain", "object_storage", "s3", "plain"), + ("default", "Local", "None", "None"), + ("disk_s3", "ObjectStorage", "S3", "Local"), + ("disk_s3_cache", "ObjectStorage", "S3", "Local"), + ("disk_s3_other_bucket", "ObjectStorage", "S3", "Local"), + ("disk_s3_plain", "ObjectStorage", "S3", "Plain"), ) assert len(expected_disks) == len(disks) for expected_disk in expected_disks: From 9bada70f45654495a30e394d94a374a862c24fb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 11 Mar 2024 14:52:32 +0100 Subject: [PATCH 0216/1081] Remove a bunch of transitive dependencies --- src/Backups/BackupCoordinationRemote.cpp | 2 ++ src/Formats/ReadSchemaUtils.cpp | 9 +++++---- src/Interpreters/DatabaseCatalog.h | 7 +++---- src/Interpreters/GraceHashJoin.cpp | 15 ++++++--------- src/Interpreters/TemporaryDataOnDisk.h | 6 +++--- src/Planner/PlannerExpressionAnalysis.cpp | 2 ++ src/Processors/QueryPlan/AggregatingStep.cpp | 1 + src/Processors/QueryPlan/CubeStep.cpp | 7 ++++--- src/Processors/QueryPlan/SortingStep.cpp | 1 + 9 files changed, 27 insertions(+), 23 deletions(-) diff --git a/src/Backups/BackupCoordinationRemote.cpp b/src/Backups/BackupCoordinationRemote.cpp index 9c509858b2a..ec652f20069 100644 --- a/src/Backups/BackupCoordinationRemote.cpp +++ b/src/Backups/BackupCoordinationRemote.cpp @@ -14,6 +14,8 @@ #include #include +#include + namespace DB { diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 5badf4301bf..b05b768899b 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ 
b/src/Formats/ReadSchemaUtils.cpp
@@ -1,10 +1,11 @@
 #include
-#include
-#include
-#include
-#include
 #include
 #include
+#include
+#include
+#include
+#include
+#include

namespace DB
{
diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h
index 4fe114cc493..6f05a3cea0f 100644
--- a/src/Interpreters/DatabaseCatalog.h
+++ b/src/Interpreters/DatabaseCatalog.h
@@ -1,15 +1,14 @@
#pragma once

 #include
+#include
+#include
 #include
 #include
-#include
 #include
 #include
-#include "Common/NamePrompter.h"
+#include
 #include
-#include "Storages/IStorage.h"
-#include "Databases/IDatabase.h"
 #include
 #include
diff --git a/src/Interpreters/GraceHashJoin.cpp b/src/Interpreters/GraceHashJoin.cpp
index 5fb92a68a29..53d1f48c291 100644
--- a/src/Interpreters/GraceHashJoin.cpp
+++ b/src/Interpreters/GraceHashJoin.cpp
@@ -1,21 +1,18 @@
+#include
+#include
+#include
+#include
 #include
 #include
 #include
-
-#include
 #include
-
-#include
+#include
 #include
 #include
 #include
-#include
-#include
-
-#include
-
 #include
+#include

namespace CurrentMetrics
diff --git a/src/Interpreters/TemporaryDataOnDisk.h b/src/Interpreters/TemporaryDataOnDisk.h
index e57d9130369..8b0649be1b1 100644
--- a/src/Interpreters/TemporaryDataOnDisk.h
+++ b/src/Interpreters/TemporaryDataOnDisk.h
@@ -2,11 +2,11 @@

 #include

-#include
-#include
+#include
 #include
-#include
+#include
 #include
+#include

namespace CurrentMetrics
diff --git a/src/Planner/PlannerExpressionAnalysis.cpp b/src/Planner/PlannerExpressionAnalysis.cpp
index 52001eb27c5..30d90a68072 100644
--- a/src/Planner/PlannerExpressionAnalysis.cpp
+++ b/src/Planner/PlannerExpressionAnalysis.cpp
@@ -3,6 +3,8 @@
 #include
 #include

+#include
+
 #include
 #include
 #include
diff --git a/src/Processors/QueryPlan/AggregatingStep.cpp b/src/Processors/QueryPlan/AggregatingStep.cpp
index f374a7b7b10..a76bacdd97b 100644
--- a/src/Processors/QueryPlan/AggregatingStep.cpp
+++ b/src/Processors/QueryPlan/AggregatingStep.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
diff --git a/src/Processors/QueryPlan/CubeStep.cpp b/src/Processors/QueryPlan/CubeStep.cpp
index 0c632c346c7..d010a3327a6 100644
--- a/src/Processors/QueryPlan/CubeStep.cpp
+++ b/src/Processors/QueryPlan/CubeStep.cpp
@@ -1,10 +1,11 @@
+#include
+#include
+#include
+#include
 #include
 #include
 #include
-#include
 #include
-#include
-#include

namespace DB
{
diff --git a/src/Processors/QueryPlan/SortingStep.cpp b/src/Processors/QueryPlan/SortingStep.cpp
index 641b9036d4c..d0491cb4b82 100644
--- a/src/Processors/QueryPlan/SortingStep.cpp
+++ b/src/Processors/QueryPlan/SortingStep.cpp
@@ -1,6 +1,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include

From 91de3825171eefb8f828c2907181b6a5e6b0f017 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=A1nos=20Benjamin=20Antal?=
Date: Mon, 11 Mar 2024 14:00:01 +0000
Subject: [PATCH 0217/1081] Split `DictionaryPipelineExecutor` into separate file

---
 src/Dictionaries/CacheDictionary.cpp | 2 +-
 .../DictionaryPipelineExecutor.cpp | 42 +++++++++++++++++++
 src/Dictionaries/DictionaryPipelineExecutor.h | 27 ++++++++++++
 src/Dictionaries/DictionarySourceHelpers.cpp | 29 -------------
 src/Dictionaries/DictionarySourceHelpers.h | 17 --------
 src/Dictionaries/FlatDictionary.cpp | 2 +-
 src/Dictionaries/HashedArrayDictionary.cpp | 1 +
 src/Dictionaries/HashedDictionary.h | 2 +-
 src/Dictionaries/IPAddressDictionary.cpp | 1 +
 src/Dictionaries/PolygonDictionary.cpp | 3 +-
 src/Dictionaries/RangeHashedDictionary.h | 6 +--
src/Dictionaries/RegExpTreeDictionary.cpp | 1 + .../registerRangeHashedDictionary.cpp | 5 ++- 13 files changed, 82 insertions(+), 56 deletions(-) create mode 100644 src/Dictionaries/DictionaryPipelineExecutor.cpp create mode 100644 src/Dictionaries/DictionaryPipelineExecutor.h diff --git a/src/Dictionaries/CacheDictionary.cpp b/src/Dictionaries/CacheDictionary.cpp index 8444042db9e..b136d5ebd71 100644 --- a/src/Dictionaries/CacheDictionary.cpp +++ b/src/Dictionaries/CacheDictionary.cpp @@ -9,7 +9,7 @@ #include #include -#include +#include #include #include diff --git a/src/Dictionaries/DictionaryPipelineExecutor.cpp b/src/Dictionaries/DictionaryPipelineExecutor.cpp new file mode 100644 index 00000000000..30d1ab95f53 --- /dev/null +++ b/src/Dictionaries/DictionaryPipelineExecutor.cpp @@ -0,0 +1,42 @@ +#include + +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +} + +DictionaryPipelineExecutor::DictionaryPipelineExecutor(QueryPipeline & pipeline_, bool async) + : async_executor(async ? std::make_unique(pipeline_) : nullptr) + , executor(async ? nullptr : std::make_unique(pipeline_)) +{ +} + +bool DictionaryPipelineExecutor::pull(Block & block) +{ + if (async_executor) + { + while (true) + { + bool has_data = async_executor->pull(block); + if (has_data && !block) + continue; + return has_data; + } + } + else if (executor) + return executor->pull(block); + else + throw Exception(ErrorCodes::LOGICAL_ERROR, "DictionaryPipelineExecutor is not initialized"); +} + +DictionaryPipelineExecutor::~DictionaryPipelineExecutor() = default; + +} diff --git a/src/Dictionaries/DictionaryPipelineExecutor.h b/src/Dictionaries/DictionaryPipelineExecutor.h new file mode 100644 index 00000000000..601213e5039 --- /dev/null +++ b/src/Dictionaries/DictionaryPipelineExecutor.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +namespace DB +{ + +class Block; +class QueryPipeline; +class PullingAsyncPipelineExecutor; +class PullingPipelineExecutor; + +/// Wrapper for `Pulling(Async)PipelineExecutor` to dynamically dispatch calls to the right executor +class DictionaryPipelineExecutor +{ +public: + DictionaryPipelineExecutor(QueryPipeline & pipeline_, bool async); + bool pull(Block & block); + + ~DictionaryPipelineExecutor(); + +private: + std::unique_ptr async_executor; + std::unique_ptr executor; +}; + +} diff --git a/src/Dictionaries/DictionarySourceHelpers.cpp b/src/Dictionaries/DictionarySourceHelpers.cpp index d9a4d9ccbcf..f0e1bc4109a 100644 --- a/src/Dictionaries/DictionarySourceHelpers.cpp +++ b/src/Dictionaries/DictionarySourceHelpers.cpp @@ -9,15 +9,11 @@ #include #include -#include -#include - namespace DB { namespace ErrorCodes { - extern const int LOGICAL_ERROR; extern const int SIZES_OF_COLUMNS_DOESNT_MATCH; } @@ -135,29 +131,4 @@ String TransformWithAdditionalColumns::getName() const return "TransformWithAdditionalColumns"; } -DictionaryPipelineExecutor::DictionaryPipelineExecutor(QueryPipeline & pipeline_, bool async) - : async_executor(async ? std::make_unique(pipeline_) : nullptr) - , executor(async ? 
nullptr : std::make_unique(pipeline_)) -{} - -bool DictionaryPipelineExecutor::pull(Block & block) -{ - if (async_executor) - { - while (true) - { - bool has_data = async_executor->pull(block); - if (has_data && !block) - continue; - return has_data; - } - } - else if (executor) - return executor->pull(block); - else - throw Exception(ErrorCodes::LOGICAL_ERROR, "DictionaryPipelineExecutor is not initialized"); -} - -DictionaryPipelineExecutor::~DictionaryPipelineExecutor() = default; - } diff --git a/src/Dictionaries/DictionarySourceHelpers.h b/src/Dictionaries/DictionarySourceHelpers.h index a545b5cdac7..39c6e7b3c42 100644 --- a/src/Dictionaries/DictionarySourceHelpers.h +++ b/src/Dictionaries/DictionarySourceHelpers.h @@ -16,10 +16,6 @@ namespace DB struct DictionaryStructure; class SettingsChanges; -class PullingPipelineExecutor; -class PullingAsyncPipelineExecutor; -class QueryPipeline; - /// For simple key Block blockForIds( @@ -55,17 +51,4 @@ private: size_t current_range_index = 0; }; -/// Wrapper for `Pulling(Async)PipelineExecutor` to dynamically dispatch calls to the right executor -class DictionaryPipelineExecutor -{ -public: - DictionaryPipelineExecutor(QueryPipeline & pipeline_, bool async); - bool pull(Block & block); - - ~DictionaryPipelineExecutor(); -private: - std::unique_ptr async_executor; - std::unique_ptr executor; -}; - } diff --git a/src/Dictionaries/FlatDictionary.cpp b/src/Dictionaries/FlatDictionary.cpp index efb7d0a176c..fc58ff525bd 100644 --- a/src/Dictionaries/FlatDictionary.cpp +++ b/src/Dictionaries/FlatDictionary.cpp @@ -15,7 +15,7 @@ #include #include -#include +#include #include #include diff --git a/src/Dictionaries/HashedArrayDictionary.cpp b/src/Dictionaries/HashedArrayDictionary.cpp index d09f402143e..2420c07277c 100644 --- a/src/Dictionaries/HashedArrayDictionary.cpp +++ b/src/Dictionaries/HashedArrayDictionary.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/src/Dictionaries/HashedDictionary.h b/src/Dictionaries/HashedDictionary.h index b3b8cc56868..46a0af487f5 100644 --- a/src/Dictionaries/HashedDictionary.h +++ b/src/Dictionaries/HashedDictionary.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Dictionaries/IPAddressDictionary.cpp b/src/Dictionaries/IPAddressDictionary.cpp index e1c9572e607..1bc6d16c932 100644 --- a/src/Dictionaries/IPAddressDictionary.cpp +++ b/src/Dictionaries/IPAddressDictionary.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include diff --git a/src/Dictionaries/PolygonDictionary.cpp b/src/Dictionaries/PolygonDictionary.cpp index 485b48d9d81..1456a0db750 100644 --- a/src/Dictionaries/PolygonDictionary.cpp +++ b/src/Dictionaries/PolygonDictionary.cpp @@ -1,6 +1,5 @@ #include "PolygonDictionary.h" -#include #include #include @@ -15,7 +14,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h index 1a6ee7e81d4..509b991b30c 100644 --- a/src/Dictionaries/RangeHashedDictionary.h +++ b/src/Dictionaries/RangeHashedDictionary.h @@ -15,6 +15,8 @@ #include #include #include +#include +#include #include #include @@ -29,10 +31,6 @@ #include #include -#include -#include -#include - namespace DB { diff --git a/src/Dictionaries/RegExpTreeDictionary.cpp b/src/Dictionaries/RegExpTreeDictionary.cpp index 4d82aa9ca0e..8930074bbe0 100644 --- a/src/Dictionaries/RegExpTreeDictionary.cpp +++ 
b/src/Dictionaries/RegExpTreeDictionary.cpp @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/src/Dictionaries/registerRangeHashedDictionary.cpp b/src/Dictionaries/registerRangeHashedDictionary.cpp index 4e20abfdb79..8123b811198 100644 --- a/src/Dictionaries/registerRangeHashedDictionary.cpp +++ b/src/Dictionaries/registerRangeHashedDictionary.cpp @@ -1,5 +1,8 @@ -#include "RangeHashedDictionary.h" +#include + #include +#include +#include #include namespace DB From 8b5ccb4735365ef81af4debcc3180f296452268d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 11 Mar 2024 15:53:46 +0100 Subject: [PATCH 0218/1081] Remove one template --- src/Interpreters/Aggregator.cpp | 65 +++++++++++++-------------------- src/Interpreters/Aggregator.h | 4 +- 2 files changed, 28 insertions(+), 41 deletions(-) diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index 80a98683867..69625dbd57d 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -2609,8 +2609,9 @@ void NO_INLINE Aggregator::mergeDataNullKey( } } -template -void NO_INLINE Aggregator::mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena) const +template +void NO_INLINE +Aggregator::mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena, bool use_compiled_functions [[maybe_unused]]) const { if constexpr (Method::low_cardinality_optimization || Method::one_key_nullable_optimization) mergeDataNullKey(table_dst, table_src, arena); @@ -2637,7 +2638,7 @@ void NO_INLINE Aggregator::mergeDataImpl(Table & table_dst, Table & table_src, A table_src.clearAndShrink(); #if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) + if (use_compiled_functions) { const auto & compiled_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions; compiled_functions.merge_aggregate_states_function(dst_places.data(), src_places.data(), dst_places.size()); @@ -2787,26 +2788,16 @@ void NO_INLINE Aggregator::mergeSingleLevelDataImpl( if (!no_more_keys) { + bool use_compiled_functions = false; #if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) - { - if (prefetch) - mergeDataImpl( - getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool); - else - mergeDataImpl( - getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool); - } - else + use_compiled_functions = compiled_aggregate_functions_holder != nullptr; #endif - { - if (prefetch) - mergeDataImpl( - getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool); - else - mergeDataImpl( - getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool); - } + if (prefetch) + mergeDataImpl( + getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool, use_compiled_functions); + else + mergeDataImpl( + getDataVariant(*res).data, getDataVariant(current).data, res->aggregates_pool, use_compiled_functions); } else if (res->without_key) { @@ -2851,26 +2842,22 @@ void NO_INLINE Aggregator::mergeBucketImpl( return; AggregatedDataVariants & current = *data[result_num]; + bool use_compiled_functions = false; #if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) - { - if (prefetch) - mergeDataImpl( - getDataVariant(*res).data.impls[bucket], getDataVariant(current).data.impls[bucket], arena); - else - mergeDataImpl( - getDataVariant(*res).data.impls[bucket], getDataVariant(current).data.impls[bucket], arena); - } - else + use_compiled_functions = 
compiled_aggregate_functions_holder != nullptr; #endif - { - if (prefetch) - mergeDataImpl( - getDataVariant(*res).data.impls[bucket], getDataVariant(current).data.impls[bucket], arena); - else - mergeDataImpl( - getDataVariant(*res).data.impls[bucket], getDataVariant(current).data.impls[bucket], arena); - } + if (prefetch) + mergeDataImpl( + getDataVariant(*res).data.impls[bucket], + getDataVariant(current).data.impls[bucket], + arena, + use_compiled_functions); + else + mergeDataImpl( + getDataVariant(*res).data.impls[bucket], + getDataVariant(current).data.impls[bucket], + arena, + use_compiled_functions); } } diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index 375b8986101..67e82cdd784 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -1429,8 +1429,8 @@ private: Arena * arena) const; /// Merge data from hash table `src` into `dst`. - template - void mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena) const; + template + void mergeDataImpl(Table & table_dst, Table & table_src, Arena * arena, bool use_compiled_functions) const; /// Merge data from hash table `src` into `dst`, but only for keys that already exist in dst. In other cases, merge the data into `overflows`. template From 724cc903afb9283a8369a62a836d04eceae42e57 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 11 Mar 2024 15:56:02 +0100 Subject: [PATCH 0219/1081] Restart CI --- tests/queries/0_stateless/03001_parallel_parsing_deadlock.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/queries/0_stateless/03001_parallel_parsing_deadlock.sh b/tests/queries/0_stateless/03001_parallel_parsing_deadlock.sh index 1bf21dfc53b..6cd5c3b486c 100755 --- a/tests/queries/0_stateless/03001_parallel_parsing_deadlock.sh +++ b/tests/queries/0_stateless/03001_parallel_parsing_deadlock.sh @@ -9,4 +9,3 @@ DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.csv $CLICKHOUSE_LOCAL -q "select number > 1000000 ? 
'error' : toString(number) from numbers(2000000) format CSV" > $DATA_FILE $CLICKHOUSE_LOCAL -q "select * from file($DATA_FILE, CSV, 'x UInt64') format Null settings input_format_allow_errors_ratio=1" rm $DATA_FILE - From ecc30448baa1c6283f3f0f13c83cfd1bf4428b9b Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 11 Mar 2024 15:26:29 +0000 Subject: [PATCH 0220/1081] Fix filtering when optimize_use_implicit_projections=1 --- .../optimizeUseAggregateProjection.cpp | 4 ++ src/Storages/VirtualColumnUtils.cpp | 2 +- src/Storages/VirtualColumnUtils.h | 3 + ...ions_non_deterministoc_functions.reference | 55 +++++++++++++++++++ ...rojections_non_deterministoc_functions.sql | 28 ++++++++++ 5 files changed, 91 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/03008_filter_projections_non_deterministoc_functions.reference create mode 100644 tests/queries/0_stateless/03008_filter_projections_non_deterministoc_functions.sql diff --git a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp index 91f4213ff43..b40fea47b3c 100644 --- a/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp +++ b/src/Processors/QueryPlan/Optimizations/optimizeUseAggregateProjection.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -464,6 +465,9 @@ AggregateProjectionCandidates getAggregateProjectionCandidates( // LOG_TRACE(getLogger("optimizeUseProjections"), "Query DAG: {}", dag.dag->dumpDAG()); candidates.has_filter = dag.filter_node; + /// We can't use minmax projection if filter has non-deterministic functions. + if (dag.filter_node && !VirtualColumnUtils::isDeterministicInScopeOfQuery(dag.filter_node)) + can_use_minmax_projection = false; if (can_use_minmax_projection) { diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 897090223d6..c3ac27903c9 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -238,7 +238,7 @@ static bool canEvaluateSubtree(const ActionsDAG::Node * node, const Block & allo return true; } -static bool isDeterministicInScopeOfQuery(const ActionsDAG::Node * node) +bool isDeterministicInScopeOfQuery(const ActionsDAG::Node * node) { for (const auto * child : node->children) { diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index b5526fc5c7f..83494872cac 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -25,6 +25,9 @@ void filterBlockWithPredicate(const ActionsDAG::Node * predicate, Block & block, /// Just filters block. Block should contain all the required columns. void filterBlockWithDAG(ActionsDAGPtr dag, Block & block, ContextPtr context); +/// Recursively checks if all functions used in DAG are deterministic in scope of query. +bool isDeterministicInScopeOfQuery(const ActionsDAG::Node * node); + /// Extract a part of predicate that can be evaluated using only columns from input_names. 
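The whole fix hinges on `isDeterministicInScopeOfQuery`: a filter containing something like `rand()` can evaluate differently when the query is answered from the implicit minmax projection, so such filters must disable that projection. The exported helper walks the filter DAG roughly as in the following simplified sketch (the types are invented stand-ins for `ActionsDAG`, not the exact source):

```cpp
#include <vector>

// Invented stand-ins for the ActionsDAG node types used by the real helper.
enum class NodeType { INPUT, CONSTANT, FUNCTION };

struct Node
{
    NodeType type = NodeType::INPUT;
    std::vector<const Node *> children;
    bool function_is_deterministic_in_scope_of_query = true;
};

// A node is deterministic in the scope of the query only if the node itself
// and all of its children are; a rand()-like function anywhere in the
// subtree disqualifies the whole filter from using the minmax projection.
bool isDeterministicInScopeOfQuerySketch(const Node * node)
{
    for (const auto * child : node->children)
        if (!isDeterministicInScopeOfQuerySketch(child))
            return false;

    if (node->type != NodeType::FUNCTION)
        return true; // inputs and constants are deterministic

    return node->function_is_deterministic_in_scope_of_query;
}
```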
ActionsDAGPtr splitFilterDagForAllowedInputs(const ActionsDAG::Node * predicate, const Block * allowed_inputs); diff --git a/tests/queries/0_stateless/03008_filter_projections_non_deterministoc_functions.reference b/tests/queries/0_stateless/03008_filter_projections_non_deterministoc_functions.reference new file mode 100644 index 00000000000..8233925d609 --- /dev/null +++ b/tests/queries/0_stateless/03008_filter_projections_non_deterministoc_functions.reference @@ -0,0 +1,55 @@ +-- count +100000 all_10_10_0 +100000 all_1_1_0 +100000 all_2_2_0 +100000 all_3_3_0 +100000 all_4_4_0 +100000 all_5_5_0 +100000 all_6_6_0 +100000 all_7_7_0 +100000 all_8_8_0 +100000 all_9_9_0 +-- rand()%2=0: +1 all_10_10_0 +1 all_1_1_0 +1 all_2_2_0 +1 all_3_3_0 +1 all_4_4_0 +1 all_5_5_0 +1 all_6_6_0 +1 all_7_7_0 +1 all_8_8_0 +1 all_9_9_0 +-- optimize_use_implicit_projections=0 +1 all_10_10_0 +1 all_1_1_0 +1 all_2_2_0 +1 all_3_3_0 +1 all_4_4_0 +1 all_5_5_0 +1 all_6_6_0 +1 all_7_7_0 +1 all_8_8_0 +1 all_9_9_0 +-- optimize_trivial_count_query=0 +1 all_10_10_0 +1 all_1_1_0 +1 all_2_2_0 +1 all_3_3_0 +1 all_4_4_0 +1 all_5_5_0 +1 all_6_6_0 +1 all_7_7_0 +1 all_8_8_0 +1 all_9_9_0 +-- optimize_trivial_count_query=0, optimize_use_implicit_projections=0 +1 all_10_10_0 +1 all_1_1_0 +1 all_2_2_0 +1 all_3_3_0 +1 all_4_4_0 +1 all_5_5_0 +1 all_6_6_0 +1 all_7_7_0 +1 all_8_8_0 +1 all_9_9_0 diff --git a/tests/queries/0_stateless/03008_filter_projections_non_deterministoc_functions.sql b/tests/queries/0_stateless/03008_filter_projections_non_deterministoc_functions.sql new file mode 100644 index 00000000000..3be9bc3982a --- /dev/null +++ b/tests/queries/0_stateless/03008_filter_projections_non_deterministoc_functions.sql @@ -0,0 +1,28 @@ +create table test (number UInt64) engine=MergeTree order by number; +system stop merges test; +INSERT INTO test select number from numbers(100000); +INSERT INTO test select number from numbers(100000); +INSERT INTO test select number from numbers(100000); +INSERT INTO test select number from numbers(100000); +INSERT INTO test select number from numbers(100000); +INSERT INTO test select number from numbers(100000); +INSERT INTO test select number from numbers(100000); +INSERT INTO test select number from numbers(100000); +INSERT INTO test select number from numbers(100000); +INSERT INTO test select number from numbers(100000); + +select '-- count'; +SELECT count(), _part FROM test GROUP BY _part ORDER BY _part; + +select '-- rand()%2=0:'; +SELECT count() > 0 AND count() < 100000, _part FROM test WHERE rand(1)%2=1 GROUP BY _part ORDER BY _part; + +select '-- optimize_use_implicit_projections=0'; +SELECT count() > 0 AND count() < 100000, _part FROM test WHERE rand(2)%2=1 GROUP BY _part ORDER BY _part settings optimize_use_implicit_projections=0; + +select '-- optimize_trivial_count_query=0'; +SELECT count() > 0 AND count() < 100000, _part FROM test WHERE rand(3)%2=1 GROUP BY _part ORDER BY _part settings optimize_trivial_count_query=0; + +select '-- optimize_trivial_count_query=0, optimize_use_implicit_projections=0'; +SELECT count() > 0 AND count() < 100000, _part FROM test WHERE rand(4)%2=1 GROUP BY _part ORDER BY _part settings optimize_trivial_count_query=0,optimize_use_implicit_projections=0; + From 879f7f2f8c862aae51ddc5a8faebb8d07b5d4493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 11 Mar 2024 16:28:25 +0100 Subject: [PATCH 0221/1081] Remove more templates for JIT --- src/Interpreters/Aggregator.cpp | 185 ++++++++++++++------------------ src/Interpreters/Aggregator.h 
| 21 ++-- 2 files changed, 96 insertions(+), 110 deletions(-) diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index 69625dbd57d..c7ce3e46446 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -1111,6 +1111,7 @@ void NO_INLINE Aggregator::executeImpl( bool all_keys_are_const, AggregateDataPtr overflow_row) const { + bool use_compiled_functions = false; if (!no_more_keys) { /// Prefetching doesn't make sense for small hash tables, because they fit in caches entirely. @@ -1118,33 +1119,47 @@ void NO_INLINE Aggregator::executeImpl( && (method.data.getBufferSizeInBytes() > min_bytes_for_prefetch); #if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder && !hasSparseArguments(aggregate_instructions)) - { - if (prefetch) - executeImplBatch( - method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row); - else - executeImplBatch( - method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row); - } - else + use_compiled_functions = compiled_aggregate_functions_holder && !hasSparseArguments(aggregate_instructions); #endif - { - if (prefetch) - executeImplBatch( - method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row); - else - executeImplBatch( - method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row); - } + if (prefetch) + executeImplBatch( + method, + state, + aggregates_pool, + row_begin, + row_end, + aggregate_instructions, + all_keys_are_const, + use_compiled_functions, + overflow_row); + else + executeImplBatch( + method, + state, + aggregates_pool, + row_begin, + row_end, + aggregate_instructions, + all_keys_are_const, + use_compiled_functions, + overflow_row); } else { - executeImplBatch(method, state, aggregates_pool, row_begin, row_end, aggregate_instructions, all_keys_are_const, overflow_row); + executeImplBatch( + method, + state, + aggregates_pool, + row_begin, + row_end, + aggregate_instructions, + all_keys_are_const, + use_compiled_functions, + overflow_row); } } -template +template void NO_INLINE Aggregator::executeImplBatch( Method & method, State & state, @@ -1153,6 +1168,7 @@ void NO_INLINE Aggregator::executeImplBatch( size_t row_end, AggregateFunctionInstruction * aggregate_instructions, bool all_keys_are_const, + bool use_compiled_functions [[maybe_unused]], AggregateDataPtr overflow_row) const { using KeyHolder = decltype(state.getKeyHolder(0, std::declval())); @@ -1284,7 +1300,7 @@ void NO_INLINE Aggregator::executeImplBatch( aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states); #if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) + if (use_compiled_functions) { const auto & compiled_aggregate_functions = compiled_aggregate_functions_holder->compiled_aggregate_functions; compiled_aggregate_functions.create_aggregate_states_function(aggregate_data); @@ -1293,20 +1309,6 @@ void NO_INLINE Aggregator::executeImplBatch( static constexpr bool skip_compiled_aggregate_functions = true; createAggregateStates(aggregate_data); } - -#if defined(MEMORY_SANITIZER) - - /// We compile only functions that do not allocate some data in Arena. Only store necessary state in AggregateData place. 
- for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size(); ++aggregate_function_index) - { - if (!is_aggregate_function_compiled[aggregate_function_index]) - continue; - - auto aggregate_data_with_offset = aggregate_data + offsets_of_aggregate_states[aggregate_function_index]; - auto data_size = params.aggregates[aggregate_function_index].function->sizeOfData(); - __msan_unpoison(aggregate_data_with_offset, data_size); - } -#endif } else #endif @@ -1339,7 +1341,7 @@ void NO_INLINE Aggregator::executeImplBatch( } #if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) + if (use_compiled_functions) { std::vector columns_data; @@ -1372,9 +1374,8 @@ void NO_INLINE Aggregator::executeImplBatch( for (size_t i = 0; i < aggregate_functions.size(); ++i) { #if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) - if (is_aggregate_function_compiled[i]) - continue; + if (use_compiled_functions && is_aggregate_function_compiled[i]) + continue; #endif AggregateFunctionInstruction * inst = aggregate_instructions + i; @@ -1387,18 +1388,19 @@ void NO_INLINE Aggregator::executeImplBatch( } -template void NO_INLINE Aggregator::executeWithoutKeyImpl( AggregatedDataWithoutKey & res, - size_t row_begin, size_t row_end, + size_t row_begin, + size_t row_end, AggregateFunctionInstruction * aggregate_instructions, - Arena * arena) const + Arena * arena, + bool use_compiled_functions [[maybe_unused]]) const { if (row_begin == row_end) return; #if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) + if (use_compiled_functions) { std::vector columns_data; @@ -1418,20 +1420,6 @@ void NO_INLINE Aggregator::executeWithoutKeyImpl( auto add_into_aggregate_states_function_single_place = compiled_aggregate_functions_holder->compiled_aggregate_functions.add_into_aggregate_states_function_single_place; add_into_aggregate_states_function_single_place(row_begin, row_end, columns_data.data(), res); - -#if defined(MEMORY_SANITIZER) - - /// We compile only functions that do not allocate some data in Arena. Only store necessary state in AggregateData place. 
- for (size_t aggregate_function_index = 0; aggregate_function_index < aggregate_functions.size(); ++aggregate_function_index) - { - if (!is_aggregate_function_compiled[aggregate_function_index]) - continue; - - auto aggregate_data_with_offset = res + offsets_of_aggregate_states[aggregate_function_index]; - auto data_size = params.aggregates[aggregate_function_index].function->sizeOfData(); - __msan_unpoison(aggregate_data_with_offset, data_size); - } -#endif } #endif @@ -1439,13 +1427,10 @@ void NO_INLINE Aggregator::executeWithoutKeyImpl( for (size_t i = 0; i < aggregate_functions.size(); ++i) { AggregateFunctionInstruction * inst = aggregate_instructions + i; - #if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) - if (is_aggregate_function_compiled[i]) - continue; + if (use_compiled_functions && is_aggregate_function_compiled[i]) + continue; #endif - addBatchSinglePlace(row_begin, row_end, inst, res + inst->state_offset, arena); } } @@ -1704,16 +1689,14 @@ bool Aggregator::executeOnBlock(Columns columns, if (result.type == AggregatedDataVariants::Type::without_key) { /// TODO: Enable compilation after investigation -// #if USE_EMBEDDED_COMPILER -// if (compiled_aggregate_functions_holder) -// { -// executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); -// } -// else -// #endif - { - executeWithoutKeyImpl(result.without_key, row_begin, row_end, aggregate_functions_instructions.data(), result.aggregates_pool); - } + bool use_compiled_functions = false; + executeWithoutKeyImpl( + result.without_key, + row_begin, + row_end, + aggregate_functions_instructions.data(), + result.aggregates_pool, + use_compiled_functions); } else { @@ -1965,19 +1948,13 @@ Aggregator::convertToBlockImpl(Method & method, Table & data, Arena * arena, Are ConvertToBlockRes res; + bool use_compiled_functions = false; if (final) { #if USE_EMBEDDED_COMPILER - if (compiled_aggregate_functions_holder) - { - static constexpr bool use_compiled_functions = !Method::low_cardinality_optimization; - res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows); - } - else + use_compiled_functions = compiled_aggregate_functions_holder != nullptr && !Method::low_cardinality_optimization; #endif - { - res = convertToBlockImplFinal(method, data, arena, aggregates_pools, rows); - } + res = convertToBlockImplFinal(method, data, arena, aggregates_pools, use_compiled_functions, rows); } else { @@ -2059,8 +2036,12 @@ inline void Aggregator::insertAggregatesIntoColumns(Mapped & mapped, MutableColu } -template -Block Aggregator::insertResultsIntoColumns(PaddedPODArray & places, OutputBlockColumns && out_cols, Arena * arena, bool has_null_key_data [[maybe_unused]]) const +Block Aggregator::insertResultsIntoColumns( + PaddedPODArray & places, + OutputBlockColumns && out_cols, + Arena * arena, + bool has_null_key_data [[maybe_unused]], + bool use_compiled_functions [[maybe_unused]]) const { std::exception_ptr exception; size_t aggregate_functions_destroy_index = 0; @@ -2068,7 +2049,7 @@ Block Aggregator::insertResultsIntoColumns(PaddedPODArray & pl try { #if USE_EMBEDDED_COMPILER - if constexpr (use_compiled_functions) + if (use_compiled_functions) { /** For JIT compiled functions we need to resize columns before pass them into compiled code. * insert_aggregates_into_columns_function function does not throw exception. 
@@ -2098,14 +2079,13 @@ Block Aggregator::insertResultsIntoColumns(PaddedPODArray & pl for (; aggregate_functions_destroy_index < params.aggregates_size;) { - if constexpr (use_compiled_functions) +#if USE_EMBEDDED_COMPILER + if (use_compiled_functions && is_aggregate_function_compiled[aggregate_functions_destroy_index]) { - if (is_aggregate_function_compiled[aggregate_functions_destroy_index]) - { - ++aggregate_functions_destroy_index; - continue; - } + ++aggregate_functions_destroy_index; + continue; } +#endif auto & final_aggregate_column = out_cols.final_aggregate_columns[aggregate_functions_destroy_index]; size_t offset = offsets_of_aggregate_states[aggregate_functions_destroy_index]; @@ -2127,14 +2107,13 @@ Block Aggregator::insertResultsIntoColumns(PaddedPODArray & pl for (; aggregate_functions_destroy_index < params.aggregates_size; ++aggregate_functions_destroy_index) { - if constexpr (use_compiled_functions) +#if USE_EMBEDDED_COMPILER + if (use_compiled_functions && is_aggregate_function_compiled[aggregate_functions_destroy_index]) { - if (is_aggregate_function_compiled[aggregate_functions_destroy_index]) - { - ++aggregate_functions_destroy_index; - continue; - } + ++aggregate_functions_destroy_index; + continue; } +#endif size_t offset = offsets_of_aggregate_states[aggregate_functions_destroy_index]; aggregate_functions[aggregate_functions_destroy_index]->destroyBatch(0, places.size(), places.data(), offset); @@ -2146,9 +2125,9 @@ Block Aggregator::insertResultsIntoColumns(PaddedPODArray & pl return finalizeBlock(params, getHeader(/* final */ true), std::move(out_cols), /* final */ true, places.size()); } -template -Aggregator::ConvertToBlockRes NO_INLINE -Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t) const +template +Aggregator::ConvertToBlockRes NO_INLINE Aggregator::convertToBlockImplFinal( + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool use_compiled_functions [[maybe_unused]], size_t) const { /// +1 for nullKeyData, if `data` doesn't have it - not a problem, just some memory for one excessive row will be preallocated const size_t max_block_size = (return_single_block ? 
data.size() : std::min(params.max_block_size, data.size())) + 1; @@ -2204,7 +2183,8 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena { if (places.size() >= max_block_size) { - res.emplace_back(insertResultsIntoColumns(places, std::move(out_cols.value()), arena, has_null_key_data)); + res.emplace_back( + insertResultsIntoColumns(places, std::move(out_cols.value()), arena, has_null_key_data, use_compiled_functions)); places.clear(); out_cols.reset(); has_null_key_data = false; @@ -2214,12 +2194,13 @@ Aggregator::convertToBlockImplFinal(Method & method, Table & data, Arena * arena if constexpr (return_single_block) { - return insertResultsIntoColumns(places, std::move(out_cols.value()), arena, has_null_key_data); + return insertResultsIntoColumns(places, std::move(out_cols.value()), arena, has_null_key_data, use_compiled_functions); } else { if (out_cols.has_value()) - res.emplace_back(insertResultsIntoColumns(places, std::move(out_cols.value()), arena, has_null_key_data)); + res.emplace_back( + insertResultsIntoColumns(places, std::move(out_cols.value()), arena, has_null_key_data, use_compiled_functions)); return res; } } diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index 67e82cdd784..d7bbe5950a0 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -1395,7 +1395,7 @@ private: AggregateDataPtr overflow_row) const; /// Specialization for a particular value no_more_keys. - template + template void executeImplBatch( Method & method, State & state, @@ -1404,16 +1404,17 @@ private: size_t row_end, AggregateFunctionInstruction * aggregate_instructions, bool all_keys_are_const, + bool use_compiled_functions, AggregateDataPtr overflow_row) const; /// For case when there are no keys (all aggregate into one row). 
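/// [Editor's note on the pattern in this commit and the follow-up "Remove
/// another template" commit: a compile-time bool template parameter is demoted
/// to an ordinary runtime argument, so one function replaces two template
/// instantiations. A minimal sketch of the transformation, with invented names
/// for illustration only:
///
///     template <bool use_compiled_functions>   /// before: branch resolved at compile time
///     void executeBatch();
///
///     void executeBatch(bool use_compiled_functions)   /// after: one symbol, runtime branch
///     {
///         if (use_compiled_functions)
///             runJitPath();
///         else
///             runInterpretedPath();
///     }
///
/// The [[maybe_unused]] on the new parameters keeps builds quiet when
/// USE_EMBEDDED_COMPILER is off and the JIT branch is preprocessed away.]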
- template void executeWithoutKeyImpl( AggregatedDataWithoutKey & res, size_t row_begin, size_t row_end, AggregateFunctionInstruction * aggregate_instructions, - Arena * arena) const; + Arena * arena, + bool use_compiled_functions) const; template void writeToTemporaryFileImpl( @@ -1467,12 +1468,16 @@ private: MutableColumns & final_aggregate_columns, Arena * arena) const; - template - Block insertResultsIntoColumns(PaddedPODArray & places, OutputBlockColumns && out_cols, Arena * arena, bool has_null_key_data) const; + Block insertResultsIntoColumns( + PaddedPODArray & places, + OutputBlockColumns && out_cols, + Arena * arena, + bool has_null_key_data, + bool use_compiled_functions) const; - template - ConvertToBlockRes - convertToBlockImplFinal(Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, size_t rows) const; + template + ConvertToBlockRes convertToBlockImplFinal( + Method & method, Table & data, Arena * arena, Arenas & aggregates_pools, bool use_compiled_functions, size_t rows) const; template ConvertToBlockRes From 81b2a1f621d9bd64fde2c4e4f6a83c9b3b0c461a Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Mon, 11 Mar 2024 15:34:02 +0000 Subject: [PATCH 0222/1081] Fix build --- programs/obfuscator/Obfuscator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 317d93aaf0c..b2bf942af4e 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -1205,7 +1205,7 @@ public: } #pragma clang diagnostic ignored "-Wunused-function" -#clang GCC diagnostic ignored "-Wmissing-declarations" +#pragma clang diagnostic ignored "-Wmissing-declarations" int mainEntryClickHouseObfuscator(int argc, char ** argv) try From 38f41ee311d0a36d194965e5815489a25c60e449 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 11 Mar 2024 16:55:30 +0100 Subject: [PATCH 0223/1081] Fix integration test --- tests/integration/test_disk_types/test.py | 10 +++++----- .../test_endpoint_macro_substitution/test.py | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/integration/test_disk_types/test.py b/tests/integration/test_disk_types/test.py index af482b97be3..86579911b3e 100644 --- a/tests/integration/test_disk_types/test.py +++ b/tests/integration/test_disk_types/test.py @@ -3,10 +3,10 @@ from helpers.cluster import ClickHouseCluster from helpers.test_tools import TSV disk_types = { - "default": "local", - "disk_s3": "s3", - "disk_hdfs": "hdfs", - "disk_encrypted": "s3", + "default": "Local", + "disk_s3": "S3", + "disk_hdfs": "HDFS", + "disk_encrypted": "S3", } @@ -55,7 +55,7 @@ def test_different_types(cluster): def test_select_by_type(cluster): node = cluster.instances["node"] for name, disk_type in list(disk_types.items()): - if disk_type != "s3": + if disk_type != "S3": assert ( node.query( "SELECT name FROM system.disks WHERE type='" + disk_type + "'" diff --git a/tests/integration/test_endpoint_macro_substitution/test.py b/tests/integration/test_endpoint_macro_substitution/test.py index 42a8ddbda84..bec3d9de0e3 100644 --- a/tests/integration/test_endpoint_macro_substitution/test.py +++ b/tests/integration/test_endpoint_macro_substitution/test.py @@ -4,10 +4,10 @@ from helpers.test_tools import TSV from pyhdfs import HdfsClient disk_types = { - "default": "local", - "disk_s3": "s3", - "disk_hdfs": "hdfs", - "disk_encrypted": "s3", + "default": "Local", + "disk_s3": "S3", + "disk_hdfs": "HDFS", + "disk_encrypted": "S3", } @@ -63,7 +63,7 
@@ def test_select_by_type(cluster):
 fs = HdfsClient(hosts=cluster.hdfs_ip)

     for name, disk_type in list(disk_types.items()):
-        if disk_type != "s3":
+        if disk_type != "S3":
             assert (
                 node.query(
                     "SELECT name FROM system.disks WHERE type='" + disk_type + "'"

From 5db08292455fb0c6f47fc0344382ab7cf3508e91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?=
Date: Mon, 11 Mar 2024 17:20:53 +0100
Subject: [PATCH 0224/1081] Remove another template

---
 src/Interpreters/Aggregator.cpp | 55 ++++++++++++++-------------------
 src/Interpreters/Aggregator.h   |  3 +-
 2 files changed, 26 insertions(+), 32 deletions(-)

diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp
index c7ce3e46446..a9578b5540f 100644
--- a/src/Interpreters/Aggregator.cpp
+++ b/src/Interpreters/Aggregator.cpp
@@ -2906,11 +2906,12 @@ ManyAggregatedDataVariants Aggregator::prepareVariantsToMerge(ManyAggregatedData
     return non_empty_data;
 }

-template
+template
 void NO_INLINE Aggregator::mergeStreamsImplCase(
     Arena * aggregates_pool,
     State & state,
     Table & data,
+    bool no_more_keys,
     AggregateDataPtr overflow_row,
     size_t row_begin,
     size_t row_end,
@@ -2922,36 +2923,34 @@ void NO_INLINE Aggregator::mergeStreamsImplCase(
     if (!arena_for_keys)
         arena_for_keys = aggregates_pool;

-    for (size_t i = row_begin; i < row_end; ++i)
+    if (no_more_keys)
     {
-        AggregateDataPtr aggregate_data = nullptr;
-
-        if constexpr (!no_more_keys)
+        for (size_t i = row_begin; i < row_end; i++)
         {
-            auto emplace_result = state.emplaceKey(data, i, *arena_for_keys); // NOLINT
-            if (emplace_result.isInserted())
+            auto find_result = state.findKey(data, i, *arena_for_keys);
+            /// aggregate_data == nullptr means that the new key did not fit in the hash table because of no_more_keys.
+            AggregateDataPtr value = find_result.isFound() ? find_result.getMapped() : overflow_row;
+            places[i] = value;
+        }
+    }
+    else
+    {
+        for (size_t i = row_begin; i < row_end; i++)
+        {
+            auto emplace_result = state.emplaceKey(data, i, *arena_for_keys);
+            if (!emplace_result.isInserted())
+                places[i] = emplace_result.getMapped();
+            else
             {
                 emplace_result.setMapped(nullptr);
-                aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
+                AggregateDataPtr aggregate_data = aggregates_pool->alignedAlloc(total_size_of_aggregate_states, align_aggregate_states);
                 createAggregateStates(aggregate_data);
                 emplace_result.setMapped(aggregate_data);
+                places[i] = aggregate_data;
             }
-            else
-                aggregate_data = emplace_result.getMapped();
         }
-        else
-        {
-            auto find_result = state.findKey(data, i, *arena_for_keys);
-            if (find_result.isFound())
-                aggregate_data = find_result.getMapped();
-        }
-
-        /// aggregate_date == nullptr means that the new key did not fit in the hash table because of no_more_keys.
-
-        AggregateDataPtr value = aggregate_data ?
aggregate_data : overflow_row; - places[i] = value; } for (size_t j = 0; j < params.aggregates_size; ++j) @@ -3005,22 +3004,16 @@ void NO_INLINE Aggregator::mergeStreamsImpl( if (use_cache) { typename Method::State state(key_columns, key_sizes, aggregation_state_cache); - - if (!no_more_keys) - mergeStreamsImplCase(aggregates_pool, state, data, overflow_row, row_begin, row_end, aggregate_columns_data, arena_for_keys); - else - mergeStreamsImplCase(aggregates_pool, state, data, overflow_row, row_begin, row_end, aggregate_columns_data, arena_for_keys); + mergeStreamsImplCase( + aggregates_pool, state, data, no_more_keys, overflow_row, row_begin, row_end, aggregate_columns_data, arena_for_keys); consecutive_keys_cache_stats.update(row_end - row_begin, state.getCacheMissesSinceLastReset()); } else { typename Method::StateNoCache state(key_columns, key_sizes, aggregation_state_cache); - - if (!no_more_keys) - mergeStreamsImplCase(aggregates_pool, state, data, overflow_row, row_begin, row_end, aggregate_columns_data, arena_for_keys); - else - mergeStreamsImplCase(aggregates_pool, state, data, overflow_row, row_begin, row_end, aggregate_columns_data, arena_for_keys); + mergeStreamsImplCase( + aggregates_pool, state, data, no_more_keys, overflow_row, row_begin, row_end, aggregate_columns_data, arena_for_keys); } } diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index d7bbe5950a0..6c357623003 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -1513,11 +1513,12 @@ private: bool final, ThreadPool * thread_pool) const; - template + template void mergeStreamsImplCase( Arena * aggregates_pool, State & state, Table & data, + bool no_more_keys, AggregateDataPtr overflow_row, size_t row_begin, size_t row_end, From 16e01eb93ad449c61417dcaccd570439364b0714 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 11 Mar 2024 18:05:51 +0100 Subject: [PATCH 0225/1081] Fix style --- src/Core/Settings.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Core/Settings.cpp b/src/Core/Settings.cpp index fb456b46d89..8257b94cd9f 100644 --- a/src/Core/Settings.cpp +++ b/src/Core/Settings.cpp @@ -15,6 +15,7 @@ namespace ErrorCodes extern const int THERE_IS_NO_PROFILE; extern const int NO_ELEMENTS_IN_CONFIG; extern const int UNKNOWN_ELEMENT_IN_CONFIG; + extern const int BAD_ARGUMENTS; } IMPLEMENT_SETTINGS_TRAITS(SettingsTraits, LIST_OF_SETTINGS) From a1e5161cee50650a5c4e87ca60e7ed9eb61451b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 11 Mar 2024 19:25:34 +0100 Subject: [PATCH 0226/1081] Disable sanitizers with 02784_parallel_replicas_automatic_decision_join --- .../02784_parallel_replicas_automatic_decision_join.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02784_parallel_replicas_automatic_decision_join.sh b/tests/queries/0_stateless/02784_parallel_replicas_automatic_decision_join.sh index ef3e6000903..801cd22b79f 100755 --- a/tests/queries/0_stateless/02784_parallel_replicas_automatic_decision_join.sh +++ b/tests/queries/0_stateless/02784_parallel_replicas_automatic_decision_join.sh @@ -1,5 +1,6 @@ #!/usr/bin/env bash -## Note: The analyzer doesn't support JOIN with parallel replicas yet +# Tags: no-tsan, no-asan, no-msan +# It's not clear why distributed aggregation is much slower with sanitizers (https://github.com/ClickHouse/ClickHouse/issues/60625) CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck 
source=../shell_config.sh From 55a82047613c607dedb592fed019d04455e8c8e8 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 11 Mar 2024 19:43:30 +0100 Subject: [PATCH 0227/1081] Fix test --- .../0_stateless/03003_compatibility_setting_bad_value.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/03003_compatibility_setting_bad_value.sql b/tests/queries/0_stateless/03003_compatibility_setting_bad_value.sql index 9a6f4e7944a..48e98798c51 100644 --- a/tests/queries/0_stateless/03003_compatibility_setting_bad_value.sql +++ b/tests/queries/0_stateless/03003_compatibility_setting_bad_value.sql @@ -1,2 +1,2 @@ -select 42 settings compatibility=NULL; -- {clientError BAD_GET} +select 42 settings compatibility=NULL; -- {clientError BAD_ARGUMENTS} From 5a71636411cb358c94e58b7caac18c22104b0e1c Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 11 Mar 2024 19:44:52 +0100 Subject: [PATCH 0228/1081] Fxi --- tests/integration/test_disk_types/test.py | 30 +++++++++++++++++++ .../test_endpoint_macro_substitution/test.py | 9 +++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/tests/integration/test_disk_types/test.py b/tests/integration/test_disk_types/test.py index 86579911b3e..5047cdc605e 100644 --- a/tests/integration/test_disk_types/test.py +++ b/tests/integration/test_disk_types/test.py @@ -50,6 +50,36 @@ def test_different_types(cluster): assert ( fields[encrypted_col_ix] == "0" ), f"{fields[name_col_ix]} expected to be non-encrypted!" +def test_different_types(cluster): + node = cluster.instances["node"] + response = TSV.toMat(node.query("SELECT * FROM system.disks FORMAT TSVWithNames")) + + assert len(response) > len(disk_types) # at least one extra line for header + + name_col_ix = response[0].index("name") + type_col_ix = response[0].index("type") + encrypted_col_ix = response[0].index("is_encrypted") + + for fields in response[1:]: # skip header + assert len(fields) >= 7 + expected_disk_type = disk_types.get(fields[name_col_ix], "UNKNOWN") + + if expected_disk_type != "Local": + disk_type = fields[response[0].index("object_storage_type")] + else: + disk_type = fields[type_col_ix] + + assert ( + expected_disk_type == disk_type + ), f"Wrong type ({fields[type_col_ix]}) for disk {fields[name_col_ix]}!" + if "encrypted" in fields[name_col_ix]: + assert ( + fields[encrypted_col_ix] == "1" + ), f"{fields[name_col_ix]} expected to be encrypted!" + else: + assert ( + fields[encrypted_col_ix] == "0" + ), f"{fields[name_col_ix]} expected to be non-encrypted!" def test_select_by_type(cluster): diff --git a/tests/integration/test_endpoint_macro_substitution/test.py b/tests/integration/test_endpoint_macro_substitution/test.py index bec3d9de0e3..e161d8e82ff 100644 --- a/tests/integration/test_endpoint_macro_substitution/test.py +++ b/tests/integration/test_endpoint_macro_substitution/test.py @@ -45,8 +45,15 @@ def test_different_types(cluster): for fields in response[1:]: # skip header assert len(fields) >= 7 + expected_disk_type = disk_types.get(fields[name_col_ix], "UNKNOWN") + + if expected_disk_type != "Local": + disk_type = fields[response[0].index("object_storage_type")] + else: + disk_type = fields[type_col_ix] + assert ( - disk_types.get(fields[name_col_ix], "UNKNOWN") == fields[type_col_ix] + expected_disk_type == disk_type ), f"Wrong type ({fields[type_col_ix]}) for disk {fields[name_col_ix]}!" 
if "encrypted" in fields[name_col_ix]: assert ( From 9b055c3a43039387b42e755efddd83b9a8934ca6 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Mon, 11 Mar 2024 20:38:30 +0100 Subject: [PATCH 0229/1081] Use assert_cast to prevent nullptr dereference on bad column types in FunctionsConversion --- src/Functions/FunctionsConversion.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h index 1522e76893e..f338af28240 100644 --- a/src/Functions/FunctionsConversion.h +++ b/src/Functions/FunctionsConversion.h @@ -4561,7 +4561,7 @@ arguments, result_type, input_rows_count); \ if (from_low_cardinality) { - const auto * col_low_cardinality = typeid_cast(arguments[0].column.get()); + const auto * col_low_cardinality = assert_cast(arguments[0].column.get()); if (skip_not_null_check && col_low_cardinality->containsNull()) throw Exception(ErrorCodes::CANNOT_INSERT_NULL_IN_ORDINARY_COLUMN, "Cannot convert NULL value to non-Nullable type"); @@ -4586,7 +4586,7 @@ arguments, result_type, input_rows_count); \ if (to_low_cardinality) { auto res_column = to_low_cardinality->createColumn(); - auto * col_low_cardinality = typeid_cast(res_column.get()); + auto * col_low_cardinality = assert_cast(res_column.get()); if (from_low_cardinality && !src_converted_to_full_column) { From 3a26b9c89ee3083884fde341c2af418bcde2f4cf Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Mon, 11 Mar 2024 19:42:25 +0000 Subject: [PATCH 0230/1081] impl --- .../0_stateless/02887_mutations_subcolumns.reference | 6 +++--- tests/queries/0_stateless/02887_mutations_subcolumns.sql | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/queries/0_stateless/02887_mutations_subcolumns.reference b/tests/queries/0_stateless/02887_mutations_subcolumns.reference index c2d6cbbd225..1ccc83b48a3 100644 --- a/tests/queries/0_stateless/02887_mutations_subcolumns.reference +++ b/tests/queries/0_stateless/02887_mutations_subcolumns.reference @@ -5,6 +5,6 @@ 4 ttt 5 ttt 6 ttt -{"a":"1","obj":{"k1":1,"k2":null,"k3":null}} -{"a":"3","obj":{"k1":null,"k2":null,"k3":1}} -{"a":"1","obj":{"k1":1,"k2":null,"k3":null}} +1 [('k1',1)] +3 [('k3',1)] +1 [('k1',1)] diff --git a/tests/queries/0_stateless/02887_mutations_subcolumns.sql b/tests/queries/0_stateless/02887_mutations_subcolumns.sql index a01158e1b06..87b3009e929 100644 --- a/tests/queries/0_stateless/02887_mutations_subcolumns.sql +++ b/tests/queries/0_stateless/02887_mutations_subcolumns.sql @@ -40,9 +40,9 @@ INSERT INTO t_mutations_subcolumns VALUES (2, '{"k2": 1}'); INSERT INTO t_mutations_subcolumns VALUES (3, '{"k3": 1}'); ALTER TABLE t_mutations_subcolumns DELETE WHERE obj.k2 = 1; -SELECT * FROM t_mutations_subcolumns ORDER BY a FORMAT JSONEachRow; +SELECT a, arrayFilter(x -> not isNull(x.2), tupleToNameValuePairs(obj)) FROM t_mutations_subcolumns ORDER BY a; ALTER TABLE t_mutations_subcolumns DELETE WHERE isNull(obj.k1); -SELECT * FROM t_mutations_subcolumns ORDER BY a FORMAT JSONEachRow; +SELECT a, arrayFilter(x -> not isNull(x.2), tupleToNameValuePairs(obj)) FROM t_mutations_subcolumns ORDER BY a; DROP TABLE t_mutations_subcolumns; From 2e74685ba6ea8a3cc32ff0e21d0ee657517ef5a4 Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 11 Mar 2024 19:58:43 +0000 Subject: [PATCH 0231/1081] Make variant tests a bit faster --- .../02941_variant_type_2.reference | 80 +++++++++---------- .../0_stateless/02941_variant_type_2.sh | 12 +-- 
...different_local_and_global_order.reference | 30 +++---- ...e_with_different_local_and_global_order.sh | 8 +- 4 files changed, 65 insertions(+), 65 deletions(-) diff --git a/tests/queries/0_stateless/02941_variant_type_2.reference b/tests/queries/0_stateless/02941_variant_type_2.reference index 4b6d53c52ac..20a5176cb5e 100644 --- a/tests/queries/0_stateless/02941_variant_type_2.reference +++ b/tests/queries/0_stateless/02941_variant_type_2.reference @@ -1,51 +1,51 @@ Memory test4 insert test4 select -1000000 -200000 -200000 -200000 -200000 -200000 -200000 -200000 +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 MergeTree compact test4 insert test4 select -1000000 -200000 -200000 -200000 -200000 -200000 -200000 -200000 +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 test4 select -1000000 -200000 -200000 -200000 -200000 -200000 -200000 -200000 +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 MergeTree wide test4 insert test4 select -1000000 -200000 -200000 -200000 -200000 -200000 -200000 -200000 +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 test4 select -1000000 -200000 -200000 -200000 -200000 -200000 -200000 -200000 +500000 +100000 +100000 +100000 +100000 +100000 +100000 +100000 diff --git a/tests/queries/0_stateless/02941_variant_type_2.sh b/tests/queries/0_stateless/02941_variant_type_2.sh index 509c537e7fc..d1fa0a777c9 100755 --- a/tests/queries/0_stateless/02941_variant_type_2.sh +++ b/tests/queries/0_stateless/02941_variant_type_2.sh @@ -12,12 +12,12 @@ CH_CLIENT="$CLICKHOUSE_CLIENT --allow_experimental_variant_type=1 --allow_suspic function test4_insert() { echo "test4 insert" - $CH_CLIENT -nmq "insert into test select number, NULL from numbers(200000); -insert into test select number + 200000, number from numbers(200000); -insert into test select number + 400000, 'str_' || toString(number) from numbers(200000); -insert into test select number + 600000, ('lc_str_' || toString(number))::LowCardinality(String) from numbers(200000); -insert into test select number + 800000, tuple(number, number + 1)::Tuple(a UInt32, b UInt32) from numbers(200000); -insert into test select number + 1000000, range(number % 20 + 1)::Array(UInt64) from numbers(200000);" + $CH_CLIENT -nmq "insert into test select number, NULL from numbers(100000); +insert into test select number + 100000, number from numbers(100000); +insert into test select number + 200000, 'str_' || toString(number) from numbers(100000); +insert into test select number + 300000, ('lc_str_' || toString(number))::LowCardinality(String) from numbers(100000); +insert into test select number + 400000, tuple(number, number + 1)::Tuple(a UInt32, b UInt32) from numbers(100000); +insert into test select number + 500000, range(number % 20 + 1)::Array(UInt64) from numbers(100000);" } function test4_select diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference index 1736a307c42..4109a88997c 100644 --- a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.reference @@ -44,9 +44,9 @@ str_38 str_38 \N ----------------------------------------------------------------------------------------------------------- test2 insert test2 select -2500000 -750000 -1750000 +500000 +100000 +400000 
----------------------------------------------------------------------------------------------------------- MergeTree compact test1 insert @@ -136,14 +136,14 @@ str_38 str_38 \N ----------------------------------------------------------------------------------------------------------- test2 insert test2 select -2500000 -750000 -1750000 +500000 +100000 +400000 ----------------------------------------------------------------------------------------------------------- test2 select -2500000 -750000 -1750000 +500000 +100000 +400000 ----------------------------------------------------------------------------------------------------------- MergeTree wide test1 insert @@ -233,12 +233,12 @@ str_38 str_38 \N ----------------------------------------------------------------------------------------------------------- test2 insert test2 select -2500000 -750000 -1750000 +500000 +100000 +400000 ----------------------------------------------------------------------------------------------------------- test2 select -2500000 -750000 -1750000 +500000 +100000 +400000 ----------------------------------------------------------------------------------------------------------- diff --git a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh index 3bb37719a3f..1d88757a5d6 100755 --- a/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh +++ b/tests/queries/0_stateless/02943_variant_type_with_different_local_and_global_order.sh @@ -29,10 +29,10 @@ function test1_select() function test2_insert() { echo "test2 insert" - $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" - $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(1000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" - $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(2000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" - $CH_CLIENT -q "insert into test select number, if(number < 3500000, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(3000000, 1000000) settings max_insert_block_size = 100000, min_insert_block_size_rows=100000" + $CH_CLIENT -q "insert into test select number, number::Variant(UInt64)::Variant(UInt64, Array(UInt64)) from numbers(200000) settings max_insert_block_size = 10000, min_insert_block_size_rows=10000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)) as res from numbers(200000, 200000) settings max_insert_block_size = 10000, min_insert_block_size_rows=10000" + $CH_CLIENT -q "insert into test select number, if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64)) as res from numbers(400000, 200000) settings max_insert_block_size = 10000, min_insert_block_size_rows=10000" + $CH_CLIENT -q "insert into test select number, if(number < 3500000, if(number % 
2, NULL, number)::Variant(UInt64)::Variant(UInt64, String, Array(UInt64)), if(number % 2, NULL, 'str_' || toString(number))::Variant(String)::Variant(UInt64, String, Array(UInt64))) from numbers(600000, 200000) settings max_insert_block_size = 10000, min_insert_block_size_rows=10000" } function test2_select() From 120a1fdb5f817b442bf659da243407fb7003eaa1 Mon Sep 17 00:00:00 2001 From: johnnymatthews <9611008+johnnymatthews@users.noreply.github.com> Date: Mon, 11 Mar 2024 17:24:33 -0300 Subject: [PATCH 0232/1081] Improves varPop docs. Adds varPopStable. --- .../aggregate-functions/reference/varpop.md | 99 +++++++++++++++++-- 1 file changed, 91 insertions(+), 8 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/varpop.md b/docs/en/sql-reference/aggregate-functions/reference/varpop.md index 751688b0830..5f18bdc30f6 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varpop.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varpop.md @@ -1,16 +1,99 @@ --- -slug: /en/sql-reference/aggregate-functions/reference/varpop +title: "varPop" +slug: "/en/sql-reference/aggregate-functions/reference/varpop" sidebar_position: 32 --- -# varPop(x) +This page covers the `varPop` and `varPopStable` functions available in ClickHouse. -Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x̅`is the average value of `x`. +## varPop -In other words, dispersion for a set of values. Returns `Float64`. +Calculates the population covariance between two data columns. The population covariance measures the degree to which two variables vary together. Calculates the amount `Σ((x - x̅)^2) / n`, where `n` is the sample size and `x̅`is the average value of `x`. -Alias: `VAR_POP`. +**Syntax** -:::note -This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varPopStable` function. It works slower but provides a lower computational error. -::: \ No newline at end of file +```sql +covarPop(x, y) +``` + +**Parameters** + +- `x`: The first data column. [Numeric](../../../native-protocol/columns.md) +- `y`: The second data column. [Numeric](../../../native-protocol/columns.md) + +**Returned value** + +Returns an integer of type `Float64`. + +**Implementation details** + +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varPopStable` function](#varPopStable). + +**Example** + +```sql +DROP TABLE IF EXISTS test_data; +CREATE TABLE test_data +( + x Int32, + y Int32 +) +ENGINE = Memory; + +INSERT INTO test_data VALUES (1, 2), (2, 3), (3, 5), (4, 6), (5, 8); + +SELECT + covarPop(x, y) AS covar_pop +FROM test_data; +``` + +```response +3 +``` + +## varPopStable + +Calculates population covariance between two data columns using a stable, numerically accurate method to calculate the variance. This function is designed to provide reliable results even with large datasets or values that might cause numerical instability in other implementations. + +**Syntax** + +```sql +covarPopStable(x, y) +``` + +**Parameters** + +- `x`: The first data column. [String literal](../syntax#syntax-string-literal) +- `y`: The second data column. [Expression](../syntax#syntax-expressions) + +**Returned value** + +Returns an integer of type `Float64`. 
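[Editor's hand check of the `covarPop` example shown further down this page (x = 1, 2, 3, 4, 5 and y = 2, 3, 5, 6, 8): the population covariance is the mean of the products of the deviations from each column's mean, which confirms the documented output of 3.]

```latex
\bar{x} = 3,\ \bar{y} = 4.8,\qquad
\frac{(-2)(-2.8) + (-1)(-1.8) + 0 \cdot 0.2 + 1 \cdot 1.2 + 2 \cdot 3.2}{5}
  = \frac{5.6 + 1.8 + 0 + 1.2 + 6.4}{5} = \frac{15}{5} = 3
```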
+ +**Implementation details** + +Unlike [`varPop()`](#varPop), this function uses a stable, numerically accurate algorithm to calculate the population variance to avoid issues like catastrophic cancellation or loss of precision. This function also handles `NaN` and `Inf` values correctly, excluding them from calculations. + +**Example** + +Query: + +```sql +DROP TABLE IF EXISTS test_data; +CREATE TABLE test_data +( + x Int32, + y Int32 +) +ENGINE = Memory; + +INSERT INTO test_data VALUES (1, 2), (2, 9), (9, 5), (4, 6), (5, 8); + +SELECT + covarPopStable(x, y) AS covar_pop_stable +FROM test_data; +``` + +```response +0.5999999999999999 +``` From 281dc8d29deba2980e6b191edefa3b62114d38a7 Mon Sep 17 00:00:00 2001 From: johnnymatthews <9611008+johnnymatthews@users.noreply.github.com> Date: Mon, 11 Mar 2024 17:48:12 -0300 Subject: [PATCH 0233/1081] Improves varSamp docs. Adds varSampStable docs. --- .../aggregate-functions/reference/varsamp.md | 126 ++++++++++++++++-- 1 file changed, 118 insertions(+), 8 deletions(-) diff --git a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md index 9b2b94936ec..e75cb075ff8 100644 --- a/docs/en/sql-reference/aggregate-functions/reference/varsamp.md +++ b/docs/en/sql-reference/aggregate-functions/reference/varsamp.md @@ -1,18 +1,128 @@ --- +title: "varSamp" slug: /en/sql-reference/aggregate-functions/reference/varsamp sidebar_position: 33 --- -# varSamp +This page contains information on the `varSamp` and `varSampStable` ClickHouse functions. -Calculates the amount `Σ((x - x̅)^2) / (n - 1)`, where `n` is the sample size and `x̅`is the average value of `x`. +## varSamp -It represents an unbiased estimate of the variance of a random variable if passed values from its sample. +Calculate the sample variance of a data set. -Returns `Float64`. When `n <= 1`, returns `+∞`. +**Syntax** -Alias: `VAR_SAMP`. +```sql +varSamp(expr) +``` -:::note -This function uses a numerically unstable algorithm. If you need [numerical stability](https://en.wikipedia.org/wiki/Numerical_stability) in calculations, use the `varSampStable` function. It works slower but provides a lower computational error. -::: +**Parameters** + +- `expr`: An expression representing the data set for which you want to calculate the sample variance. [Expression](../syntax#syntax-expressions) + +**Returned value** + +Returns a Float64 value representing the sample variance of the input data set. + +**Implementation details** + +The `varSamp()` function calculates the sample variance using the following formula: + +```plaintext +∑(x - mean(x))^2 / (n - 1) +``` + +Where: + +- `x` is each individual data point in the data set. +- `mean(x)` is the arithmetic mean of the data set. +- `n` is the number of data points in the data set. + +The function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPop()` function](./varpop#varpop) instead. + +This function uses a numerically unstable algorithm. If you need numerical stability in calculations, use the slower but more stable [`varSampStable` function](#varSampStable). 
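[Editor's hand check of the sample-variance formula above against the example that follows (values 10.5, 12.3, 9.8, 11.2, 10.7): the mean is 54.5 / 5 = 10.9, and dividing the summed squared deviations by n - 1 reproduces the documented outputs, 0.8650000000000091 for `varSamp` and 0.865 for `varSampStable`, up to floating-point rounding.]

```latex
s^2 = \frac{0.16 + 1.96 + 1.21 + 0.09 + 0.04}{5 - 1} = \frac{3.46}{4} = 0.865
```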
+ +**Example** + +Query: + +```sql +CREATE TABLE example_table +( + id UInt64, + value Float64 +) +ENGINE = MergeTree +ORDER BY id; + +INSERT INTO example_table VALUES (1, 10.5), (2, 12.3), (3, 9.8), (4, 11.2), (5, 10.7); + +SELECT varSamp(value) FROM example_table; +``` + +Response: + +```response +0.8650000000000091 +``` + +## varSampStable + +Calculate the sample variance of a data set using a numerically stable algorithm. + +**Syntax** + +```sql +varSampStable(expr) +``` + +**Parameters** + +- `expr`: An expression representing the data set for which you want to calculate the sample variance. [Expression](../syntax#syntax-expressions) + +**Returned value** + +The `varSampStable()` function returns a Float64 value representing the sample variance of the input data set. + +**Implementation details** + +The `varSampStable()` function calculates the sample variance using the same formula as the [`varSamp()`](#varSamp function): + +```plaintext +∑(x - mean(x))^2 / (n - 1) +``` + +Where: +- `x` is each individual data point in the data set. +- `mean(x)` is the arithmetic mean of the data set. +- `n` is the number of data points in the data set. + +The difference between `varSampStable()` and `varSamp()` is that `varSampStable()` is designed to provide a more deterministic and stable result when dealing with floating-point arithmetic. It uses an algorithm that minimizes the accumulation of rounding errors, which can be particularly important when dealing with large data sets or data with a wide range of values. + +Like `varSamp()`, the `varSampStable()` function assumes that the input data set represents a sample from a larger population. If you want to calculate the variance of the entire population (when you have the complete data set), you should use the [`varPopStable()` function](./varpop#varpopstable) instead. + +**Example** + +Query: + +```sql +CREATE TABLE example_table +( + id UInt64, + value Float64 +) +ENGINE = MergeTree +ORDER BY id; + +INSERT INTO example_table VALUES (1, 10.5), (2, 12.3), (3, 9.8), (4, 11.2), (5, 10.7); + +SELECT varSampStable(value) FROM example_table; +``` + +Response: + +```response +0.865 +``` + +This query calculates the sample variance of the `value` column in the `example_table` using the `varSampStable()` function. The result shows that the sample variance of the values `[10.5, 12.3, 9.8, 11.2, 10.7]` is approximately 0.865, which may differ slightly from the result of `varSamp()` due to the more precise handling of floating-point arithmetic. 
From 563df9bdcb425810a0c2d3ecb11302e22039c048 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Mon, 11 Mar 2024 22:49:18 +0100 Subject: [PATCH 0234/1081] Fix multiple bugs in groupArraySorted --- .../AggregateFunctionGroupArraySorted.cpp | 8 +++++--- .../0_stateless/03008_groupSortedArray_field.reference | 3 +++ .../queries/0_stateless/03008_groupSortedArray_field.sql | 6 ++++++ 3 files changed, 14 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/03008_groupSortedArray_field.reference create mode 100644 tests/queries/0_stateless/03008_groupSortedArray_field.sql diff --git a/src/AggregateFunctions/AggregateFunctionGroupArraySorted.cpp b/src/AggregateFunctions/AggregateFunctionGroupArraySorted.cpp index 0e9856cfab9..0692ff28f18 100644 --- a/src/AggregateFunctions/AggregateFunctionGroupArraySorted.cpp +++ b/src/AggregateFunctions/AggregateFunctionGroupArraySorted.cpp @@ -310,10 +310,12 @@ public: { for (Field & element : values) { - UInt8 is_null = 0; - readBinary(is_null, buf); - if (!is_null) + bool has_value = 0; + readBinary(has_value, buf); + if (has_value) serialization->deserializeBinary(element, buf, {}); + else + element = Field{}; } } else diff --git a/tests/queries/0_stateless/03008_groupSortedArray_field.reference b/tests/queries/0_stateless/03008_groupSortedArray_field.reference new file mode 100644 index 00000000000..a7f89ebcf58 --- /dev/null +++ b/tests/queries/0_stateless/03008_groupSortedArray_field.reference @@ -0,0 +1,3 @@ +0A01003C79A557B3C43400C4865AA84C3B4B01000650BC18F7DE0B00FAAF43E708213401008ED706EA0A9F13007228F915F5602C0100C692CA8FB81405003A6D357047EB1A01008416B7C3239EE3FF7BE9483CDC61DC01003E133A7C081AF5FFC1ECC583F7E5EA01000000000000000000000000000000000100C4865AA84C3BCBFF3B79A557B3C4B4010024C46EF500F1ECFFDB3B910AFF0ED301005E2FC14EBAEAE5FFA1D03EB14515DA +070109000000010600000001080000000103000000010500000001040000000107000000 AggregateFunction(groupArraySorted(10), Nullable(Decimal(3, 0))) +[3,4,5,6,7,8,9] diff --git a/tests/queries/0_stateless/03008_groupSortedArray_field.sql b/tests/queries/0_stateless/03008_groupSortedArray_field.sql new file mode 100644 index 00000000000..6d2aea641a5 --- /dev/null +++ b/tests/queries/0_stateless/03008_groupSortedArray_field.sql @@ -0,0 +1,6 @@ +-- https://github.com/ClickHouse/ClickHouse/issues/61186 +SELECT hex(CAST(unhex('0A01003C79A557B3C43400C4865AA84C3B4B01000650BC18F7DE0B00FAAF43E708213401008ED706EA0A9F13007228F915F5602C0100C692CA8FB81405003A6D357047EB1A01008416B7C3239EE3FF7BE9483CDC61DC01003E133A7C081AF5FFC1ECC583F7E5EA01000000000000000000000000000000000100C4865AA84C3BCBFF3B79A557B3C4B4010024C46EF500F1ECFFDB3B910AFF0ED301005E2FC14EBAEAE5FFA1D03EB14515DA'), + 'AggregateFunction(groupArraySorted(10), Decimal(38, 38))')); + +Select hex(groupArraySortedState(10)((number < 3 ? 
NULL : number)::Nullable(Decimal(3))) as t), toTypeName(t) from numbers(10); +Select finalizeAggregation(unhex('070109000000010600000001080000000103000000010500000001040000000107000000')::AggregateFunction(groupArraySorted(10), Nullable(Decimal(3, 0)))); From 1b04cc0b4da6d32fd4741ea953dfed060f846d0b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 12 Mar 2024 03:56:10 +0100 Subject: [PATCH 0235/1081] Fix strange log message --- src/Loggers/Loggers.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/Loggers/Loggers.cpp b/src/Loggers/Loggers.cpp index 1d17585cc96..cc6e4691737 100644 --- a/src/Loggers/Loggers.cpp +++ b/src/Loggers/Loggers.cpp @@ -304,6 +304,9 @@ void Loggers::buildLoggers(Poco::Util::AbstractConfiguration & config, Poco::Log log_settings.turn_off_logger = DB::TextLog::shouldTurnOffLogger(); + log_settings.database = config.getString("text_log.database", "system"); + log_settings.table = config.getString("text_log.table", "text_log"); + split->addTextLog(DB::TextLog::getLogQueue(log_settings), text_log_level); } #endif From 29fce4143d1f177efdf1864d41429cfadea22ff1 Mon Sep 17 00:00:00 2001 From: unashi Date: Tue, 12 Mar 2024 12:07:24 +0800 Subject: [PATCH 0236/1081] [fix] log level from fatal->error when hardlink and copy both fail --- src/Storages/MergeTree/MergeTreeData.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index c76ffeee874..97968f1b9c1 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -7174,7 +7174,7 @@ std::pair MergeTreeData::cloneAn } } if (!copy_successful) - LOG_FATAL(&Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail"); + LOG_ERROR(&Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail"); } From c628eaca8ba19584fe36067dee8e6ec3e8f5cc4b Mon Sep 17 00:00:00 2001 From: Zhuo Qiu Date: Tue, 26 Dec 2023 14:13:07 +0800 Subject: [PATCH 0237/1081] Consider deleted rows when selecting parts to merge --- src/Storages/MergeTree/IMergeTreeDataPart.cpp | 90 +++++++++++++++++++ src/Storages/MergeTree/IMergeTreeDataPart.h | 13 +++ .../MergeTree/MergeFromLogEntryTask.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 1 + .../MergeTree/MergeTreeDataMergerMutator.cpp | 11 ++- .../MergeTree/MergeTreeDataMergerMutator.h | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 1 + src/Storages/MergeTree/MergeTreeSettings.h | 2 + .../MergeTree/MergedBlockOutputStream.cpp | 5 ++ .../MergeTree/MutateFromLogEntryTask.cpp | 2 +- src/Storages/MergeTree/MutateTask.cpp | 54 +++++++++++ .../MergeTree/ReplicatedMergeTreeQueue.cpp | 5 +- src/Storages/StorageMergeTree.cpp | 4 +- .../03001_consider_lwd_when_merge.reference | 3 + .../03001_consider_lwd_when_merge.sql | 23 +++++ 15 files changed, 208 insertions(+), 10 deletions(-) create mode 100644 tests/queries/0_stateless/03001_consider_lwd_when_merge.reference create mode 100644 tests/queries/0_stateless/03001_consider_lwd_when_merge.sql diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp index 3fea6d04944..c099512d636 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp +++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp @@ -609,6 +609,15 @@ UInt64 IMergeTreeDataPart::getMarksCount() const return index_granularity.getMarksCount(); } +UInt64 IMergeTreeDataPart::getExistingBytesOnDisk() const +{ + if (storage.getSettings()->exclude_deleted_rows_for_part_size_in_merge && 
supportLightweightDeleteMutate() && hasLightweightDelete() + && existing_rows_count.has_value() && existing_rows_count.value() < rows_count && rows_count > 0) + return bytes_on_disk * existing_rows_count.value() / rows_count; + else + return bytes_on_disk; +} + size_t IMergeTreeDataPart::getFileSizeOrZero(const String & file_name) const { auto checksum = checksums.files.find(file_name); @@ -691,6 +700,7 @@ void IMergeTreeDataPart::loadColumnsChecksumsIndexes(bool require_columns_checks calculateColumnsAndSecondaryIndicesSizesOnDisk(); loadRowsCount(); /// Must be called after loadIndexGranularity() as it uses the value of `index_granularity`. + loadExistingRowsCount(); /// Must be called after loadRowsCount() as it uses the value of `rows_count`. loadPartitionAndMinMaxIndex(); if (!parent_part) { @@ -1313,6 +1323,86 @@ void IMergeTreeDataPart::loadRowsCount() } } +void IMergeTreeDataPart::loadExistingRowsCount() +{ + if (existing_rows_count.has_value()) + return; + + if (!rows_count || !storage.getSettings()->load_existing_rows_count_for_old_parts || !supportLightweightDeleteMutate() + || !hasLightweightDelete()) + existing_rows_count = rows_count; + else + existing_rows_count = readExistingRowsCount(); +} + +UInt64 IMergeTreeDataPart::readExistingRowsCount() +{ + const size_t total_mark = getMarksCount(); + if (!total_mark) + return rows_count; + + NamesAndTypesList cols; + cols.push_back(LightweightDeleteDescription::FILTER_COLUMN); + + StorageMetadataPtr metadata_ptr = storage.getInMemoryMetadataPtr(); + StorageSnapshotPtr storage_snapshot_ptr = std::make_shared(storage, metadata_ptr); + + MergeTreeReaderPtr reader = getReader( + cols, + storage_snapshot_ptr, + MarkRanges{MarkRange(0, total_mark)}, + nullptr, + storage.getContext()->getMarkCache().get(), + std::make_shared(), + MergeTreeReaderSettings{}, + ValueSizeMap{}, + ReadBufferFromFileBase::ProfileCallback{}); + + if (!reader) + { + LOG_WARNING(storage.log, "Create reader failed while reading existing rows count"); + return rows_count; + } + + size_t current_mark = 0; + bool continue_reading = false; + size_t current_row = 0; + size_t existing_count = 0; + + while (current_row < rows_count) + { + size_t rows_to_read = index_granularity.getMarkRows(current_mark); + continue_reading = (current_mark != 0); + + Columns result; + result.resize(1); + + size_t rows_read = reader->readRows(current_mark, total_mark, continue_reading, rows_to_read, result); + if (!rows_read) + { + LOG_WARNING(storage.log, "Part {} has lightweight delete, but _row_exists column not found", name); + return rows_count; + } + + current_row += rows_read; + current_mark += (rows_to_read == rows_read); + + const ColumnUInt8 * row_exists_col = typeid_cast(result[0].get()); + if (!row_exists_col) + { + LOG_WARNING(storage.log, "Part {} _row_exists column type is not UInt8", name); + return rows_count; + } + + for (UInt8 row_exists : row_exists_col->getData()) + if (row_exists) + existing_count++; + } + + LOG_DEBUG(storage.log, "Part {} existing_rows_count = {}", name, existing_count); + return existing_count; +} + void IMergeTreeDataPart::appendFilesOfRowsCount(Strings & files) { files.push_back("count.txt"); diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.h b/src/Storages/MergeTree/IMergeTreeDataPart.h index aaae64a5970..8bd32e777bc 100644 --- a/src/Storages/MergeTree/IMergeTreeDataPart.h +++ b/src/Storages/MergeTree/IMergeTreeDataPart.h @@ -231,6 +231,9 @@ public: size_t rows_count = 0; + /// Existing rows count (excluding lightweight deleted rows) + 
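/// [Editor's note: std::nullopt until the part is loaded. Per this patch,
/// loadExistingRowsCount() sets it to rows_count when the part has no
/// lightweight deletes (or when load_existing_rows_count_for_old_parts is
/// disabled), and otherwise to the number of set bits in the part's
/// _row_exists column, as counted by readExistingRowsCount().]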
std::optional existing_rows_count; + time_t modification_time = 0; /// When the part is removed from the working set. Changes once. mutable std::atomic remove_time { std::numeric_limits::max() }; @@ -373,6 +376,10 @@ public: void setBytesOnDisk(UInt64 bytes_on_disk_) { bytes_on_disk = bytes_on_disk_; } void setBytesUncompressedOnDisk(UInt64 bytes_uncompressed_on_disk_) { bytes_uncompressed_on_disk = bytes_uncompressed_on_disk_; } + /// Returns estimated size of existing rows if setting exclude_deleted_rows_for_part_size_in_merge is true + /// Otherwise returns bytes_on_disk + UInt64 getExistingBytesOnDisk() const; + size_t getFileSizeOrZero(const String & file_name) const; auto getFilesChecksums() const { return checksums.files; } @@ -499,6 +506,9 @@ public: /// True if here is lightweight deleted mask file in part. bool hasLightweightDelete() const; + /// Read existing rows count from _row_exists column + UInt64 readExistingRowsCount(); + void writeChecksums(const MergeTreeDataPartChecksums & checksums_, const WriteSettings & settings); /// Checks the consistency of this data part. @@ -664,6 +674,9 @@ private: /// For the older format version calculates rows count from the size of a column with a fixed size. void loadRowsCount(); + /// Load existing rows count from _row_exists column if load_existing_rows_count_for_old_parts is true. + void loadExistingRowsCount(); + static void appendFilesOfRowsCount(Strings & files); /// Loads ttl infos in json format from file ttl.txt. If file doesn't exists assigns ttl infos with all zeros diff --git a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp index ae6e398026d..5ef004ec019 100644 --- a/src/Storages/MergeTree/MergeFromLogEntryTask.cpp +++ b/src/Storages/MergeTree/MergeFromLogEntryTask.cpp @@ -174,7 +174,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MergeFromLogEntryTask::prepare() } /// Start to make the main work - size_t estimated_space_for_merge = MergeTreeDataMergerMutator::estimateNeededDiskSpace(parts); + size_t estimated_space_for_merge = MergeTreeDataMergerMutator::estimateNeededDiskSpace(parts, true); /// Can throw an exception while reserving space. 
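/// [Editor's sketch of the estimate computed above, following this patch's
/// estimateNeededDiskSpace(source_parts, is_merge): parts whose maximum TTL
/// has already expired contribute nothing; for a merge (is_merge == true)
/// each remaining part contributes getExistingBytesOnDisk(), that is,
/// bytes_on_disk scaled by the fraction of rows surviving lightweight
/// deletes, while a mutation still charges the full getBytesOnDisk(); the
/// sum is then multiplied by DISK_USAGE_COEFFICIENT_TO_RESERVE as a safety
/// margin. Example with invented numbers: a 1 GiB source part with 1000000
/// rows of which 250000 survive contributes about 256 MiB to a merge
/// estimate.]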
IMergeTreeDataPart::TTLInfos ttl_infos; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index d56cf761cf4..5e05f75c1c5 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -8261,6 +8261,7 @@ std::pair MergeTreeData::createE new_data_part->setColumns(columns, {}, metadata_snapshot->getMetadataVersion()); new_data_part->rows_count = block.rows(); + new_data_part->existing_rows_count = block.rows(); new_data_part->partition = partition; diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp index 1bf1d4a3c29..90144a8cc8f 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp @@ -405,7 +405,7 @@ MergeTreeDataMergerMutator::MergeSelectingInfo MergeTreeDataMergerMutator::getPo } IMergeSelector::Part part_info; - part_info.size = part->getBytesOnDisk(); + part_info.size = part->getExistingBytesOnDisk(); part_info.age = res.current_time - part->modification_time; part_info.level = part->info.level; part_info.data = ∂ @@ -611,7 +611,7 @@ SelectPartsDecision MergeTreeDataMergerMutator::selectAllPartsToMergeWithinParti return SelectPartsDecision::CANNOT_SELECT; } - sum_bytes += (*it)->getBytesOnDisk(); + sum_bytes += (*it)->getExistingBytesOnDisk(); prev_it = it; ++it; @@ -793,7 +793,7 @@ MergeTreeData::DataPartPtr MergeTreeDataMergerMutator::renameMergedTemporaryPart } -size_t MergeTreeDataMergerMutator::estimateNeededDiskSpace(const MergeTreeData::DataPartsVector & source_parts) +size_t MergeTreeDataMergerMutator::estimateNeededDiskSpace(const MergeTreeData::DataPartsVector & source_parts, const bool & is_merge) { size_t res = 0; time_t current_time = std::time(nullptr); @@ -804,7 +804,10 @@ size_t MergeTreeDataMergerMutator::estimateNeededDiskSpace(const MergeTreeData:: if (part_max_ttl && part_max_ttl <= current_time) continue; - res += part->getBytesOnDisk(); + if (is_merge) + res += part->getExistingBytesOnDisk(); + else + res += part->getBytesOnDisk(); } return static_cast(res * DISK_USAGE_COEFFICIENT_TO_RESERVE); diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h index f3a3f51b6c3..731c5e1d176 100644 --- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h +++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h @@ -193,7 +193,7 @@ public: /// The approximate amount of disk space needed for merge or mutation. With a surplus. - static size_t estimateNeededDiskSpace(const MergeTreeData::DataPartsVector & source_parts); + static size_t estimateNeededDiskSpace(const MergeTreeData::DataPartsVector & source_parts, const bool & is_merge); private: /** Select all parts belonging to the same partition. 
diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index fdac16ae19a..2ba74e44b40 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -537,6 +537,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( new_data_part->setColumns(columns, infos, metadata_snapshot->getMetadataVersion()); new_data_part->rows_count = block.rows(); + new_data_part->existing_rows_count = block.rows(); new_data_part->partition = std::move(partition); new_data_part->minmax_idx = std::move(minmax_idx); new_data_part->is_temp = true; diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 925dc973dc3..ea54f61b4b6 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -42,6 +42,7 @@ struct Settings; M(UInt64, compact_parts_max_bytes_to_buffer, 128 * 1024 * 1024, "Only available in ClickHouse Cloud", 0) \ M(UInt64, compact_parts_max_granules_to_buffer, 128, "Only available in ClickHouse Cloud", 0) \ M(UInt64, compact_parts_merge_max_bytes_to_prefetch_part, 16 * 1024 * 1024, "Only available in ClickHouse Cloud", 0) \ + M(Bool, load_existing_rows_count_for_old_parts, false, "Whether to load existing_rows_count for existing parts. If false, existing_rows_count will be equal to rows_count for existing parts.", 0) \ \ /** Merge settings. */ \ M(UInt64, merge_max_block_size, 8192, "How many rows in blocks should be formed for merge operations. By default has the same value as `index_granularity`.", 0) \ @@ -79,6 +80,7 @@ struct Settings; M(UInt64, number_of_mutations_to_throw, 1000, "If table has at least that many unfinished mutations, throw 'Too many mutations' exception. Disabled if set to 0", 0) \ M(UInt64, min_delay_to_mutate_ms, 10, "Min delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \ M(UInt64, max_delay_to_mutate_ms, 1000, "Max delay of mutating MergeTree table in milliseconds, if there are a lot of unfinished mutations", 0) \ + M(Bool, exclude_deleted_rows_for_part_size_in_merge, false, "Use an estimated source part size (excluding lightweight deleted rows) when selecting parts to merge", 0) \ \ /** Inserts settings. */ \ M(UInt64, parts_to_delay_insert, 1000, "If table contains at least that many active parts in single partition, artificially slow down insert into table. 
diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp
index f2fe2e0f255..d8555d69788 100644
--- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp
+++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp
@@ -188,6 +188,11 @@ MergedBlockOutputStream::Finalizer MergedBlockOutputStream::finalizePartAsync(
     new_part->index_granularity = writer->getIndexGranularity();
     new_part->calculateColumnsAndSecondaryIndicesSizesOnDisk();
 
+    /// In a mutation, existing_rows_count has already been calculated in PartMergerWriter.
+    /// In a merge, lightweight-deleted rows were physically deleted, so existing_rows_count equals rows_count.
+    if (!new_part->existing_rows_count.has_value())
+        new_part->existing_rows_count = rows_count;
+
     if (default_codec != nullptr)
         new_part->default_codec = default_codec;
diff --git a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
index a9ff687fe4d..620b0e34c6a 100644
--- a/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
+++ b/src/Storages/MergeTree/MutateFromLogEntryTask.cpp
@@ -49,7 +49,7 @@ ReplicatedMergeMutateTaskBase::PrepareResult MutateFromLogEntryTask::prepare()
     }
 
     /// TODO - some better heuristic?
-    size_t estimated_space_for_result = MergeTreeDataMergerMutator::estimateNeededDiskSpace({source_part});
+    size_t estimated_space_for_result = MergeTreeDataMergerMutator::estimateNeededDiskSpace({source_part}, false);
 
     if (entry.create_time + storage_settings_ptr->prefer_fetch_merged_part_time_threshold.totalSeconds() <= time(nullptr)
         && estimated_space_for_result >= storage_settings_ptr->prefer_fetch_merged_part_size_threshold)
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 150cc27c369..3d31d2f05db 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -60,6 +60,26 @@ static bool checkOperationIsNotCanceled(ActionBlocker & merges_blocker, MergeLis
     return true;
 }
 
+static UInt64 getExistingRowsCount(const Block & block)
+{
+    auto column = block.getByName(LightweightDeleteDescription::FILTER_COLUMN.name).column;
+    const ColumnUInt8 * row_exists_col = typeid_cast<const ColumnUInt8 *>(column.get());
+
+    if (!row_exists_col)
+    {
+        LOG_WARNING(&Poco::Logger::get("MutationHelpers::getExistingRowsCount"), "_row_exists column type is not UInt8");
+        return block.rows();
+    }
+
+    UInt64 existing_count = 0;
+
+    for (UInt8 row_exists : row_exists_col->getData())
+        if (row_exists)
+            existing_count++;
+
+    return existing_count;
+}
+
 /** Split mutation commands into two parts:
  *  First part should be executed by mutations interpreter.
  *  Other is just simple drop/renames, so they can be executed without interpreter.
@@ -997,6 +1017,9 @@ struct MutationContext
     bool need_prefix = true;
 
     scope_guard temporary_directory_lock;
+
+    /// Whether this mutation contains a lightweight delete
+    bool has_lightweight_delete;
 };
 
 using MutationContextPtr = std::shared_ptr<MutationContext>;
@@ -1191,6 +1214,7 @@ public:
         }
         case State::SUCCESS:
         {
+            finalize();
             return false;
         }
     }
@@ -1226,6 +1250,11 @@ private:
     const ProjectionsDescription & projections;
 
     ExecutableTaskPtr merge_projection_parts_task_ptr;
+
+    /// Existing rows count calculated during part writing.
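+    /// (i.e. the number of rows whose _row_exists value is 1, as counted by MutationHelpers::getExistingRowsCount above).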
+    /// It is initialized in prepare(), calculated in mutateOriginalPartAndPrepareProjections(),
+    /// and set on new_data_part in finalize().
+    size_t existing_rows_count;
 };
 
@@ -1238,6 +1267,8 @@ void PartMergerWriter::prepare()
     {
         // We split the materialization into multiple stages similar to the process of INSERT SELECT query.
         projection_squashes.emplace_back(settings.min_insert_block_size_rows, settings.min_insert_block_size_bytes);
     }
+
+    existing_rows_count = 0;
 }
 
@@ -1251,6 +1282,9 @@ bool PartMergerWriter::mutateOriginalPartAndPrepareProjections()
         ctx->out->write(cur_block);
 
+        if (ctx->has_lightweight_delete)
+            existing_rows_count += MutationHelpers::getExistingRowsCount(cur_block);
+
         for (size_t i = 0, size = ctx->projections_to_build.size(); i < size; ++i)
         {
             const auto & projection = *ctx->projections_to_build[i];
@@ -1340,6 +1374,12 @@ bool PartMergerWriter::iterateThroughAllProjections()
     return true;
 }
 
+void PartMergerWriter::finalize()
+{
+    if (ctx->has_lightweight_delete)
+        ctx->new_data_part->existing_rows_count = existing_rows_count;
+}
+
 class MutateAllPartColumnsTask : public IExecutableTask
 {
 public:
@@ -2185,6 +2225,20 @@ bool MutateTask::prepare()
     if (ctx->mutating_pipeline_builder.initialized())
         ctx->execute_ttl_type = MutationHelpers::shouldExecuteTTL(ctx->metadata_snapshot, ctx->interpreter->getColumnDependencies());
 
+    if (ctx->updated_header.has(LightweightDeleteDescription::FILTER_COLUMN.name))
+    {
+        /// This mutation contains a lightweight delete, so reset existing_rows_count of the new data part to 0.
+        /// It will be updated while writing the _row_exists column.
+        ctx->has_lightweight_delete = true;
+    }
+    else
+    {
+        ctx->has_lightweight_delete = false;
+
+        /// This mutation does not contain a lightweight delete, so copy existing_rows_count from the source part.
+        ctx->new_data_part->existing_rows_count = ctx->source_part->existing_rows_count.value_or(ctx->source_part->rows_count);
+    }
+
     /// All columns from part are changed and may be some more that were missing before in part
     /// TODO We can materialize compact part without copying data
     if (!isWidePart(ctx->source_part) || !isFullPartStorage(ctx->source_part->getDataPartStorage())
diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp
index 858eae4afd9..42f564f40da 100644
--- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp
@@ -1350,7 +1350,10 @@ bool ReplicatedMergeTreeQueue::shouldExecuteLogEntry(
         auto part = data.getPartIfExists(name, {MergeTreeDataPartState::PreActive, MergeTreeDataPartState::Active, MergeTreeDataPartState::Outdated});
         if (part)
         {
-            sum_parts_size_in_bytes += part->getBytesOnDisk();
+            if (entry.type == LogEntry::MERGE_PARTS)
+                sum_parts_size_in_bytes += part->getExistingBytesOnDisk();
+            else
+                sum_parts_size_in_bytes += part->getBytesOnDisk();
 
             if (entry.type == LogEntry::MUTATE_PART && !storage.mutation_backoff_policy.partCanBeMutated(part->name))
             {
diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp
index 663e7f435b7..c816a6f0dce 100644
--- a/src/Storages/StorageMergeTree.cpp
+++ b/src/Storages/StorageMergeTree.cpp
@@ -1113,7 +1113,7 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMerge(
         if (isTTLMergeType(future_part->merge_type))
             getContext()->getMergeList().bookMergeWithTTL();
 
-        merging_tagger = std::make_unique<CurrentlyMergingPartsTagger>(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace(future_part->parts), *this, metadata_snapshot, false);
+        merging_tagger = std::make_unique<CurrentlyMergingPartsTagger>(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace(future_part->parts, true), *this, metadata_snapshot, false);
         return std::make_shared<MergeMutateSelectedEntry>(future_part, std::move(merging_tagger), std::make_shared<MutationCommands>());
     }
 
@@ -1336,7 +1336,7 @@ MergeMutateSelectedEntryPtr StorageMergeTree::selectPartsToMutate(
             future_part->name = part->getNewName(new_part_info);
             future_part->part_format = part->getFormat();
 
-            tagger = std::make_unique<CurrentlyMergingPartsTagger>(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}), *this, metadata_snapshot, true);
+            tagger = std::make_unique<CurrentlyMergingPartsTagger>(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}, false), *this, metadata_snapshot, true);
             return std::make_shared<MergeMutateSelectedEntry>(future_part, std::move(tagger), commands, txn);
         }
     }
diff --git a/tests/queries/0_stateless/03001_consider_lwd_when_merge.reference b/tests/queries/0_stateless/03001_consider_lwd_when_merge.reference
new file mode 100644
index 00000000000..19920de3d3c
--- /dev/null
+++ b/tests/queries/0_stateless/03001_consider_lwd_when_merge.reference
@@ -0,0 +1,3 @@
+2
+2
+1
diff --git a/tests/queries/0_stateless/03001_consider_lwd_when_merge.sql b/tests/queries/0_stateless/03001_consider_lwd_when_merge.sql
new file mode 100644
index 00000000000..a65e8877020
--- /dev/null
+++ b/tests/queries/0_stateless/03001_consider_lwd_when_merge.sql
@@ -0,0 +1,23 @@
+DROP TABLE IF EXISTS lwd_merge;
+
+CREATE TABLE lwd_merge (id UInt64 CODEC(NONE))
+    ENGINE = MergeTree ORDER BY id
+SETTINGS max_bytes_to_merge_at_max_space_in_pool = 80000, exclude_deleted_rows_for_part_size_in_merge = 0;
+
+INSERT INTO lwd_merge SELECT number FROM numbers(10000);
+INSERT INTO lwd_merge SELECT number FROM numbers(10000, 10000);
+
+OPTIMIZE TABLE lwd_merge;
+SELECT count() FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_merge' AND active = 1;
+
+DELETE FROM lwd_merge WHERE id % 10 > 0;
+
+OPTIMIZE TABLE lwd_merge;
+SELECT count() FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_merge' AND active = 1;
+
+ALTER TABLE lwd_merge MODIFY SETTING exclude_deleted_rows_for_part_size_in_merge = 1;
+
+OPTIMIZE TABLE lwd_merge;
+SELECT count() FROM system.parts WHERE database = currentDatabase() AND table = 'lwd_merge' AND active = 1;
+
+DROP TABLE IF EXISTS lwd_merge;
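A note on why the reference output is 2 / 2 / 1 (byte counts are approximate; only the relative sizes matter): with CODEC(NONE), each 10000-row UInt64 part is roughly 10000 × 8 = 80000 bytes, so merging both parts would need about 160000 bytes — over the max_bytes_to_merge_at_max_space_in_pool cap — and the first two OPTIMIZE statements leave two active parts. The lightweight DELETE marks ~90% of rows deleted without shrinking the parts on disk, so nothing changes until exclude_deleted_rows_for_part_size_in_merge = 1 is set: the existing-rows estimate then drops to roughly 2 × 8000 = 16000 bytes, the merge fits under the cap, and one part remains.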
From 4ad8141a162b3b7735e2f08c069e98b9c2ba2382 Mon Sep 17 00:00:00 2001
From: Zhuo Qiu
Date: Wed, 28 Feb 2024 19:54:21 -0600
Subject: [PATCH 0238/1081] Maintain compatibility of estimateNeededDiskSpace()

Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com>
---
 src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp | 4 ++--
 src/Storages/MergeTree/MergeTreeDataMergerMutator.h   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
index 90144a8cc8f..53d49b51e8f 100644
--- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
@@ -793,7 +793,7 @@ MergeTreeData::DataPartPtr MergeTreeDataMergerMutator::renameMergedTemporaryPart
 }
 
-size_t MergeTreeDataMergerMutator::estimateNeededDiskSpace(const MergeTreeData::DataPartsVector & source_parts, const bool & is_merge)
+size_t MergeTreeDataMergerMutator::estimateNeededDiskSpace(const MergeTreeData::DataPartsVector & source_parts, const bool & account_for_deleted)
 {
     size_t res = 0;
     time_t current_time = std::time(nullptr);
@@ -804,7 +804,7 @@ size_t MergeTreeDataMergerMutator::estimateNeededDiskSpace(const MergeTreeData::
         if (part_max_ttl && part_max_ttl <= current_time)
             continue;
 
-        if (is_merge)
+        if (account_for_deleted)
             res += part->getExistingBytesOnDisk();
         else
             res += part->getBytesOnDisk();
diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h
index 731c5e1d176..669ee040af3 100644
--- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h
+++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h
@@ -193,7 +193,7 @@ public:
 
     /// The approximate amount of disk space needed for merge or mutation. With a surplus.
-    static size_t estimateNeededDiskSpace(const MergeTreeData::DataPartsVector & source_parts, const bool & is_merge);
+    static size_t estimateNeededDiskSpace(const MergeTreeData::DataPartsVector & source_parts, const bool & account_for_deleted = false);
 
 private:
     /** Select all parts belonging to the same partition.

From 10c7ea7a29d8426fcf4d0ca09c778cdd3e56fbbd Mon Sep 17 00:00:00 2001
From: unashi
Date: Tue, 12 Mar 2024 14:32:07 +0800
Subject: [PATCH 0239/1081] [debug] fast test again

---
 src/Storages/StorageMergeTree.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp
index 055a48ad998..928ee094583 100644
--- a/src/Storages/StorageMergeTree.cpp
+++ b/src/Storages/StorageMergeTree.cpp
@@ -2083,7 +2083,6 @@ void StorageMergeTree::replacePartitionFrom(const StoragePtr & source_table, con
         MergeTreePartInfo dst_part_info(partition_id, temp_index, temp_index, src_part->info.level);
 
         IDataPartStorage::ClonePartParams clone_params{.txn = local_context->getCurrentTransaction()};
-        bool on_same_disk = false;
         for (const DiskPtr & disk : this->getStoragePolicy()->getDisks())
             if (disk->getName() == src_part->getDataPartStorage().getDiskName())

From 05969a39f390445c8d0df43b7077e0eb81db3538 Mon Sep 17 00:00:00 2001
From: Zhuo Qiu
Date: Tue, 12 Mar 2024 14:45:25 +0800
Subject: [PATCH 0240/1081] resolve conflicts

---
 src/Storages/MergeTree/IMergeTreeDataPart.cpp | 5 +++--
 src/Storages/MergeTree/MutateTask.cpp         | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index c099512d636..5fede923252 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -1342,7 +1342,7 @@ UInt64 IMergeTreeDataPart::readExistingRowsCount()
         return rows_count;
 
     NamesAndTypesList cols;
-    cols.push_back(LightweightDeleteDescription::FILTER_COLUMN);
+    cols.emplace_back(RowExistsColumn::name, RowExistsColumn::type);
 
     StorageMetadataPtr metadata_ptr = storage.getInMemoryMetadataPtr();
     StorageSnapshotPtr storage_snapshot_ptr = std::make_shared<StorageSnapshot>(storage, metadata_ptr);
@@ -1351,7 +1351,8 @@ UInt64 IMergeTreeDataPart::readExistingRowsCount()
         cols,
         storage_snapshot_ptr,
         MarkRanges{MarkRange(0, total_mark)},
-        nullptr,
+        /*virtual_fields=*/ {},
+        /*uncompressed_cache=*/{},
         storage.getContext()->getMarkCache().get(),
         std::make_shared<AlterConversions>(),
         MergeTreeReaderSettings{},
diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp
index 3d31d2f05db..4d1e60f450e 100644
--- a/src/Storages/MergeTree/MutateTask.cpp
+++ b/src/Storages/MergeTree/MutateTask.cpp
@@ -62,7 +62,7 @@ static UInt64 getExistingRowsCount(const Block & block)
 {
-    auto column = block.getByName(LightweightDeleteDescription::FILTER_COLUMN.name).column;
+    auto column = block.getByName(RowExistsColumn::name).column;
     const ColumnUInt8 * row_exists_col = typeid_cast<const ColumnUInt8 *>(column.get());
 
     if (!row_exists_col)
@@ -2225,7 +2225,7 @@ bool MutateTask::prepare()
     if (ctx->mutating_pipeline_builder.initialized())
         ctx->execute_ttl_type = MutationHelpers::shouldExecuteTTL(ctx->metadata_snapshot, ctx->interpreter->getColumnDependencies());
 
-    if (ctx->updated_header.has(LightweightDeleteDescription::FILTER_COLUMN.name))
+    if (ctx->updated_header.has(RowExistsColumn::name))
     {
         /// This mutation contains a lightweight delete, so reset existing_rows_count of the new data part to 0.
         /// It will be updated while writing the _row_exists column.

From c1b94b2170acbf72d066928fd168c18dc571d505 Mon Sep 17 00:00:00 2001
From: unashi
Date: Tue, 12 Mar 2024 15:33:43 +0800
Subject: [PATCH 0241/1081] [debug] fast test again again

---
 src/Storages/MergeTree/MergeTreeData.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index 97968f1b9c1..7fc504d71f1 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -7174,7 +7174,7 @@ std::pair<MergeTreeData::MutableDataPartPtr, scope_guard> MergeTreeData::cloneAn
         }
     }
     if (!copy_successful)
-        LOG_ERROR(&Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail");
+        LOG_ERROR(&Poco::Logger::get("MergeTreeData"), "Hard link fail, clone fail.");

From 5f1991fbef2f959f1d55c62194d948814d199fa9 Mon Sep 17 00:00:00 2001
From: lgbo-ustc
Date: Tue, 12 Mar 2024 15:53:28 +0800
Subject: [PATCH 0242/1081] too big translation unit in Aggregator

---
 src/Common/HashTable/FixedHashMap.h            |    3 +
 src/Common/HashTable/TwoLevelStringHashTable.h |    1 +
 src/Interpreters/AggregatedData.h              |  142 +++
 src/Interpreters/AggregatedDataVariants.cpp    |  255 ++++
 src/Interpreters/AggregatedDataVariants.h      |  320 +++++
 src/Interpreters/AggregationMethod.cpp         |  215 ++++
 src/Interpreters/AggregationMethod.h           |  320 +++++
 src/Interpreters/Aggregator.cpp                |  512 ++++----
 src/Interpreters/Aggregator.h                  | 1035 +----------------
 9 files changed, 1541 insertions(+), 1262 deletions(-)
 create mode 100644 src/Interpreters/AggregatedData.h
 create mode 100644 src/Interpreters/AggregatedDataVariants.cpp
 create mode 100644 src/Interpreters/AggregatedDataVariants.h
 create mode 100644 src/Interpreters/AggregationMethod.cpp
 create mode 100644 src/Interpreters/AggregationMethod.h

diff --git a/src/Common/HashTable/FixedHashMap.h b/src/Common/HashTable/FixedHashMap.h
index e835a6fba94..537f37a9e6c 100644
--- a/src/Common/HashTable/FixedHashMap.h
+++ b/src/Common/HashTable/FixedHashMap.h
@@ -109,6 +109,9 @@ public:
 
     using Base::Base;
 
+    FixedHashMap() = default;
+    FixedHashMap(size_t ) {} /// NOLINT
+
     template <typename Self, typename Func>
     void ALWAYS_INLINE mergeToViaEmplace(Self & that, Func && func)
     {
diff --git a/src/Common/HashTable/TwoLevelStringHashTable.h b/src/Common/HashTable/TwoLevelStringHashTable.h
index 54c208c5b60..1ce6b3d02e3 100644
--- a/src/Common/HashTable/TwoLevelStringHashTable.h
+++ b/src/Common/HashTable/TwoLevelStringHashTable.h
@@ -38,6 +38,7 @@ public:
     Impl impls[NUM_BUCKETS];
 
     TwoLevelStringHashTable() = default;
+    TwoLevelStringHashTable(size_t ) {} /// NOLINT
 
     template <typename Source>
     explicit TwoLevelStringHashTable(const Source & src)
diff --git a/src/Interpreters/AggregatedData.h b/src/Interpreters/AggregatedData.h
new file mode 100644
index 00000000000..6cd6b190801
--- /dev/null
+++ b/src/Interpreters/AggregatedData.h
@@ -0,0 +1,142 @@
+#pragma once
+#include <AggregateFunctions/IAggregateFunction.h>
+
+#include <Common/HashTable/FixedHashMap.h>
+#include <Common/HashTable/HashMap.h>
+#include <Common/HashTable/TwoLevelHashMap.h>
+#include <Common/HashTable/TwoLevelStringHashMap.h>
+namespace DB
+{
+/** Different data structures that can be used for aggregation.
+  * For efficiency, the aggregation data itself is put into the pool.
+  * Data and pool ownership (states of aggregate functions)
+  * is acquired later - in the `convertToBlocks` function, by the ColumnAggregateFunction object.
+  *
+  * Most data structures exist in two versions: normal and two-level (TwoLevel).
+  * A two-level hash table works a little slower with a small number of different keys,
+  * but with a large number of different keys scales better, because it allows
+  * parallelizing some operations (merging, post-processing) in a natural way.
+  *
+  * To ensure efficient work over a wide range of conditions,
+  * first single-level hash tables are used,
+  * and when the number of different keys is large enough,
+  * they are converted to two-level ones.
+  *
+  * PS. There are many different approaches to the effective implementation of parallel and distributed aggregation,
+  * best suited for different cases, and this approach is just one of them, chosen for a combination of reasons.
+  */
+
+using AggregatedDataWithoutKey = AggregateDataPtr;
+
+using AggregatedDataWithUInt8Key = FixedImplicitZeroHashMapWithCalculatedSize<UInt8, AggregateDataPtr>;
+using AggregatedDataWithUInt16Key = FixedImplicitZeroHashMap<UInt16, AggregateDataPtr>;
+
+using AggregatedDataWithUInt32Key = HashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>>;
+using AggregatedDataWithUInt64Key = HashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>>;
+
+using AggregatedDataWithShortStringKey = StringHashMap<AggregateDataPtr>;
+
+using AggregatedDataWithStringKey = HashMapWithSavedHash<StringRef, AggregateDataPtr>;
+
+using AggregatedDataWithKeys128 = HashMap<UInt128, AggregateDataPtr, UInt128HashCRC32>;
+using AggregatedDataWithKeys256 = HashMap<UInt256, AggregateDataPtr, UInt256HashCRC32>;
+
+using AggregatedDataWithUInt32KeyTwoLevel = TwoLevelHashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>>;
+using AggregatedDataWithUInt64KeyTwoLevel = TwoLevelHashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>>;
+
+using AggregatedDataWithShortStringKeyTwoLevel = TwoLevelStringHashMap<AggregateDataPtr>;
+
+using AggregatedDataWithStringKeyTwoLevel = TwoLevelHashMapWithSavedHash<StringRef, AggregateDataPtr>;
+
+using AggregatedDataWithKeys128TwoLevel = TwoLevelHashMap<UInt128, AggregateDataPtr, UInt128HashCRC32>;
+using AggregatedDataWithKeys256TwoLevel = TwoLevelHashMap<UInt256, AggregateDataPtr, UInt256HashCRC32>;
+
+/** Variants with a better hash function, using more than 32 bits for the hash.
+  * Used for the merging phase of external aggregation, where the number of keys may be far greater than 4 billion,
+  * but we keep in memory and merge only a sub-partition of them simultaneously.
+  * TODO We need to switch to a better hash function not only for external aggregation,
+  * but also for huge aggregation results on machines with terabytes of RAM.
+  */
+
+using AggregatedDataWithUInt64KeyHash64 = HashMap<UInt64, AggregateDataPtr, DefaultHash<UInt64>>;
+using AggregatedDataWithStringKeyHash64 = HashMapWithSavedHash<StringRef, AggregateDataPtr, StringRefHash64>;
+using AggregatedDataWithKeys128Hash64 = HashMap<UInt128, AggregateDataPtr, UInt128Hash>;
+using AggregatedDataWithKeys256Hash64 = HashMap<UInt256, AggregateDataPtr, UInt256Hash>;
+
+template <typename Base>
+struct AggregationDataWithNullKey : public Base
+{
+    using Base::Base;
+
+    bool & hasNullKeyData() { return has_null_key_data; }
+    AggregateDataPtr & getNullKeyData() { return null_key_data; }
+    bool hasNullKeyData() const { return has_null_key_data; }
+    const AggregateDataPtr & getNullKeyData() const { return null_key_data; }
+    size_t size() const { return Base::size() + (has_null_key_data ? 1 : 0); }
+    bool empty() const { return Base::empty() && !has_null_key_data; }
+    void clear()
+    {
+        Base::clear();
+        has_null_key_data = false;
+    }
+    void clearAndShrink()
+    {
+        Base::clearAndShrink();
+        has_null_key_data = false;
+    }
+
+private:
+    bool has_null_key_data = false;
+    AggregateDataPtr null_key_data = nullptr;
+};
+
+template <typename Base>
+struct AggregationDataWithNullKeyTwoLevel : public Base
+{
+    using Base::Base;
+    using Base::impls;
+
+    AggregationDataWithNullKeyTwoLevel() = default;
+
+    template <typename Other>
+    explicit AggregationDataWithNullKeyTwoLevel(const Other & other) : Base(other)
+    {
+        impls[0].hasNullKeyData() = other.hasNullKeyData();
+        impls[0].getNullKeyData() = other.getNullKeyData();
+    }
+
+    bool & hasNullKeyData() { return impls[0].hasNullKeyData(); }
+    AggregateDataPtr & getNullKeyData() { return impls[0].getNullKeyData(); }
+    bool hasNullKeyData() const { return impls[0].hasNullKeyData(); }
+    const AggregateDataPtr & getNullKeyData() const { return impls[0].getNullKeyData(); }
+};
+
+template <typename ... Types>
+using HashTableWithNullKey = AggregationDataWithNullKey<HashMapTable<Types ...>>;
+template <typename ... Types>
+using StringHashTableWithNullKey = AggregationDataWithNullKey<StringHashMap<Types ...>>;
+
+using AggregatedDataWithNullableUInt8Key = AggregationDataWithNullKey<AggregatedDataWithUInt8Key>;
+using AggregatedDataWithNullableUInt16Key = AggregationDataWithNullKey<AggregatedDataWithUInt16Key>;
+using AggregatedDataWithNullableUInt32Key = AggregationDataWithNullKey<AggregatedDataWithUInt32Key>;
+
+
+using AggregatedDataWithNullableUInt64Key = AggregationDataWithNullKey<AggregatedDataWithUInt64Key>;
+using AggregatedDataWithNullableStringKey = AggregationDataWithNullKey<AggregatedDataWithStringKey>;
+using AggregatedDataWithNullableShortStringKey = AggregationDataWithNullKey<AggregatedDataWithShortStringKey>;
+
+
+using AggregatedDataWithNullableUInt32KeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
+    TwoLevelHashMap<UInt32, AggregateDataPtr, HashCRC32<UInt32>,
+        TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
+using AggregatedDataWithNullableUInt64KeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
+    TwoLevelHashMap<UInt64, AggregateDataPtr, HashCRC32<UInt64>,
+        TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
+
+using AggregatedDataWithNullableShortStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
+    TwoLevelStringHashMap<AggregateDataPtr, HashTableAllocator, StringHashTableWithNullKey>>;
+
+using AggregatedDataWithNullableStringKeyTwoLevel = AggregationDataWithNullKeyTwoLevel<
+    TwoLevelHashMapWithSavedHash<StringRef, AggregateDataPtr, DefaultHash<StringRef>,
+        TwoLevelHashTableGrower<>, HashTableAllocator, HashTableWithNullKey>>;
+}
diff --git a/src/Interpreters/AggregatedDataVariants.cpp b/src/Interpreters/AggregatedDataVariants.cpp
new file mode 100644
index 00000000000..0c86c58bd3e
--- /dev/null
+++ b/src/Interpreters/AggregatedDataVariants.cpp
@@ -0,0 +1,255 @@
+#include <Interpreters/AggregatedDataVariants.h>
+#include <Interpreters/Aggregator.h>
+
+namespace ProfileEvents
+{
+    extern const Event AggregationPreallocatedElementsInHashTables;
+}
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int UNKNOWN_AGGREGATED_DATA_VARIANT;
+    extern const int LOGICAL_ERROR;
+
+}
+using ColumnsHashing::HashMethodContext;
+using ColumnsHashing::HashMethodContextPtr;
+using ColumnsHashing::LastElementCacheStats;
+
+AggregatedDataVariants::AggregatedDataVariants() : aggregates_pools(1, std::make_shared<Arena>()), aggregates_pool(aggregates_pools.back().get()) {}
+
+AggregatedDataVariants::~AggregatedDataVariants()
+{
+    if (aggregator && !aggregator->all_aggregates_has_trivial_destructor)
+    {
+        try
+        {
+            aggregator->destroyAllAggregateStates(*this);
+        }
+        catch (...)
+        {
+            tryLogCurrentException(__PRETTY_FUNCTION__);
+        }
+    }
+}
+
+// The std::is_constructible trait isn't suitable here because some classes have template constructors with semantics different from providing size hints.
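+// (For example, TwoLevelStringHashTable's template constructor takes a `Source` table to copy from, not a size hint — see the HashTable changes above.)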
+// Also string hash table variants are not supported due to the fact that both local perf tests and tests in CI showed slowdowns for them.
+template <typename T>
+struct HasConstructorOfNumberOfElements : std::false_type
+{
+};
+
+template <typename ... Ts>
+struct HasConstructorOfNumberOfElements<HashMapTable<Ts ...>> : std::true_type
+{
+};
+
+template <typename Key, typename Cell, typename Hash, typename Grower, typename Allocator, template <typename ...> typename ImplTable>
+struct HasConstructorOfNumberOfElements<TwoLevelHashMapTable<Key, Cell, Hash, Grower, Allocator, ImplTable>> : std::true_type
+{
+};
+
+template <typename ... Ts>
+struct HasConstructorOfNumberOfElements<HashTable<Ts ...>> : std::true_type
+{
+};
+
+template <typename ... Ts>
+struct HasConstructorOfNumberOfElements<TwoLevelHashTable<Ts ...>> : std::true_type
+{
+};
+
+template