diff --git a/cmake/split_debug_symbols.cmake b/cmake/split_debug_symbols.cmake
index d6821eb6c48..67c2c386f20 100644
--- a/cmake/split_debug_symbols.cmake
+++ b/cmake/split_debug_symbols.cmake
@@ -1,3 +1,5 @@
+# Generates a separate file with debug symbols while stripping them from the main binary.
+# This is needed for Debian packages.
macro(clickhouse_split_debug_symbols)
set(oneValueArgs TARGET DESTINATION_DIR BINARY_PATH)
diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh
index cc2613cbaf5..150ce1ab385 100755
--- a/docker/packager/binary/build.sh
+++ b/docker/packager/binary/build.sh
@@ -126,6 +126,7 @@ fi
mv ./programs/clickhouse* /output || mv ./programs/*_fuzzer /output
[ -x ./programs/self-extracting/clickhouse ] && mv ./programs/self-extracting/clickhouse /output
+[ -x ./programs/self-extracting/clickhouse-stripped ] && mv ./programs/self-extracting/clickhouse-stripped /output
mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds
mv ./programs/*.dict ./programs/*.options ./programs/*_seed_corpus.zip /output ||: # libFuzzer oss-fuzz compatible infrastructure
diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh
index 0ff79e24bf8..f09b8225da4 100755
--- a/docker/test/base/setup_export_logs.sh
+++ b/docker/test/base/setup_export_logs.sh
@@ -15,8 +15,8 @@ CLICKHOUSE_CI_LOGS_USER=${CLICKHOUSE_CI_LOGS_USER:-ci}
# Pre-configured destination cluster, where to export the data
CLICKHOUSE_CI_LOGS_CLUSTER=${CLICKHOUSE_CI_LOGS_CLUSTER:-system_logs_export}
-EXTRA_COLUMNS=${EXTRA_COLUMNS:-"pull_request_number UInt32, commit_sha String, check_start_time DateTime('UTC'), check_name String, instance_type String, instance_id String, "}
-EXTRA_COLUMNS_EXPRESSION=${EXTRA_COLUMNS_EXPRESSION:-"CAST(0 AS UInt32) AS pull_request_number, '' AS commit_sha, now() AS check_start_time, '' AS check_name, '' AS instance_type, '' AS instance_id"}
+EXTRA_COLUMNS=${EXTRA_COLUMNS:-"pull_request_number UInt32, commit_sha String, check_start_time DateTime('UTC'), check_name LowCardinality(String), instance_type LowCardinality(String), instance_id String, INDEX ix_pr (pull_request_number) TYPE set(100), INDEX ix_commit (commit_sha) TYPE set(100), INDEX ix_check_time (check_start_time) TYPE minmax, "}
+EXTRA_COLUMNS_EXPRESSION=${EXTRA_COLUMNS_EXPRESSION:-"CAST(0 AS UInt32) AS pull_request_number, '' AS commit_sha, now() AS check_start_time, toLowCardinality('') AS check_name, toLowCardinality('') AS instance_type, '' AS instance_id"}
EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name, "}
function __set_connection_args
@@ -127,7 +127,7 @@ function setup_logs_replication
do
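- # Calculate hash of its structure. Note: 4 is the version of extra columns - increment it if extra columns are changed:
+ # Calculate hash of its structure. Note: the first argument of sipHash64 is the version of extra columns - increment it if extra columns are changed: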
hash=$(clickhouse-client --query "
- SELECT sipHash64(4, groupArray((name, type)))
+ SELECT sipHash64(7, groupArray((name, type)))
FROM (SELECT name, type FROM system.columns
WHERE database = 'system' AND table = '$table'
ORDER BY position)
diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md
index 0e2c0c00e4c..63f75fb7830 100644
--- a/docs/en/interfaces/http.md
+++ b/docs/en/interfaces/http.md
@@ -438,7 +438,7 @@ $ curl -v 'http://localhost:8123/predefined_query'
< X-ClickHouse-Query-Id: 96fe0052-01e6-43ce-b12a-6b7370de6e8a
< X-ClickHouse-Format: Template
< X-ClickHouse-Timezone: Asia/Shanghai
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
# HELP "Query" "Number of executing queries"
@@ -603,7 +603,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/hi'
< Connection: Keep-Alive
< Content-Type: text/html; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
* Connection #0 to host localhost left intact
@@ -643,7 +643,7 @@ $ curl -v -H 'XXX:xxx' 'http://localhost:8123/get_config_static_handler'
< Connection: Keep-Alive
< Content-Type: text/plain; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
* Connection #0 to host localhost left intact
@@ -695,7 +695,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_absolute_path_static_handler'
< Connection: Keep-Alive
< Content-Type: text/html; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
Absolute Path File
@@ -714,7 +714,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler'
< Connection: Keep-Alive
< Content-Type: text/html; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
Relative Path File
diff --git a/docs/en/interfaces/third-party/client-libraries.md b/docs/en/interfaces/third-party/client-libraries.md
index 7b5c4f27a2a..5aa634785aa 100644
--- a/docs/en/interfaces/third-party/client-libraries.md
+++ b/docs/en/interfaces/third-party/client-libraries.md
@@ -74,6 +74,7 @@ ClickHouse Inc does **not** maintain the libraries listed below and hasn’t don
### Elixir
- [clickhousex](https://github.com/appodeal/clickhousex/)
- [pillar](https://github.com/sofakingworld/pillar)
+ - [ecto_ch](https://github.com/plausible/ecto_ch)
### Nim
- [nim-clickhouse](https://github.com/leonardoce/nim-clickhouse)
### Haskell
diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md
index d5791d56191..98636a653fb 100644
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@@ -214,7 +214,7 @@ Max consecutive resolving failures before dropping a host from ClickHouse DNS ca
Type: UInt32
-Default: 1024
+Default: 10
## index_mark_cache_policy
diff --git a/docs/en/sql-reference/data-types/float.md b/docs/en/sql-reference/data-types/float.md
index 3b55271f707..f1b99153b41 100644
--- a/docs/en/sql-reference/data-types/float.md
+++ b/docs/en/sql-reference/data-types/float.md
@@ -16,7 +16,7 @@ CREATE TABLE IF NOT EXISTS float_vs_decimal
my_decimal Decimal64(3)
)Engine=MergeTree ORDER BY tuple()
-INSERT INTO float_vs_decimal SELECT round(canonicalRand(), 3) AS res, res FROM system.numbers LIMIT 1000000; # Generate 1 000 000 random number with 2 decimal places and store them as a float and as a decimal
+INSERT INTO float_vs_decimal SELECT round(randCanonical(), 3) AS res, res FROM system.numbers LIMIT 1000000; # Generate 1 000 000 random numbers with 3 decimal places and store them as a float and as a decimal
SELECT sum(my_float), sum(my_decimal) FROM float_vs_decimal;
> 500279.56300000014 500279.563
diff --git a/docs/ru/interfaces/http.md b/docs/ru/interfaces/http.md
index 16927408bc4..be8cfbdda6c 100644
--- a/docs/ru/interfaces/http.md
+++ b/docs/ru/interfaces/http.md
@@ -366,7 +366,7 @@ $ curl -v 'http://localhost:8123/predefined_query'
< X-ClickHouse-Query-Id: 96fe0052-01e6-43ce-b12a-6b7370de6e8a
< X-ClickHouse-Format: Template
< X-ClickHouse-Timezone: Asia/Shanghai
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0"}
<
# HELP "Query" "Number of executing queries"
@@ -529,7 +529,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/hi'
< Connection: Keep-Alive
< Content-Type: text/html; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
* Connection #0 to host localhost left intact
@@ -569,7 +569,7 @@ $ curl -v -H 'XXX:xxx' 'http://localhost:8123/get_config_static_handler'
< Connection: Keep-Alive
< Content-Type: text/plain; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
* Connection #0 to host localhost left intact
@@ -621,7 +621,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_absolute_path_static_handler'
< Connection: Keep-Alive
< Content-Type: text/html; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
Absolute Path File
@@ -640,7 +640,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler'
< Connection: Keep-Alive
< Content-Type: text/html; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
Relative Path File
diff --git a/docs/zh/interfaces/http.md b/docs/zh/interfaces/http.md
index dfdcf53bd3f..84ca5ed0c47 100644
--- a/docs/zh/interfaces/http.md
+++ b/docs/zh/interfaces/http.md
@@ -362,7 +362,7 @@ $ curl -v 'http://localhost:8123/predefined_query'
< X-ClickHouse-Query-Id: 96fe0052-01e6-43ce-b12a-6b7370de6e8a
< X-ClickHouse-Format: Template
< X-ClickHouse-Timezone: Asia/Shanghai
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
# HELP "Query" "Number of executing queries"
@@ -520,7 +520,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/hi'
< Connection: Keep-Alive
< Content-Type: text/html; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
* Connection #0 to host localhost left intact
@@ -560,7 +560,7 @@ $ curl -v -H 'XXX:xxx' 'http://localhost:8123/get_config_static_handler'
< Connection: Keep-Alive
< Content-Type: text/plain; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
* Connection #0 to host localhost left intact
@@ -612,7 +612,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_absolute_path_static_handler'
< Connection: Keep-Alive
< Content-Type: text/html; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
Absolute Path File
@@ -631,7 +631,7 @@ $ curl -vv -H 'XXX:xxx' 'http://localhost:8123/get_relative_path_static_handler'
< Connection: Keep-Alive
< Content-Type: text/html; charset=UTF-8
< Transfer-Encoding: chunked
-< Keep-Alive: timeout=3
+< Keep-Alive: timeout=10
< X-ClickHouse-Summary: {"read_rows":"0","read_bytes":"0","written_rows":"0","written_bytes":"0","total_rows_to_read":"0","elapsed_ns":"662334"}
<
Relative Path File
diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt
index eb4a898d472..eb117e74f6b 100644
--- a/programs/CMakeLists.txt
+++ b/programs/CMakeLists.txt
@@ -439,6 +439,13 @@ else()
install (TARGETS clickhouse RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse)
endif()
+# A target to produce a stripped binary.
+# Note: this is different from the above, which extracts debug symbols to a separate place.
+add_custom_target(clickhouse-stripped ALL
+ COMMAND "${STRIP_PATH}" -o "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-stripped" --strip-debug --remove-section=.comment --remove-section=.note "${CMAKE_CURRENT_BINARY_DIR}/clickhouse"
+ DEPENDS clickhouse
+ COMMENT "Stripping clickhouse binary" VERBATIM)
+
if (ENABLE_TESTS)
set (CLICKHOUSE_UNIT_TESTS_TARGETS unit_tests_dbms)
add_custom_target (clickhouse-tests ALL DEPENDS ${CLICKHOUSE_UNIT_TESTS_TARGETS})
diff --git a/programs/self-extracting/CMakeLists.txt b/programs/self-extracting/CMakeLists.txt
index f3ff0bbcd78..4b6dd07f618 100644
--- a/programs/self-extracting/CMakeLists.txt
+++ b/programs/self-extracting/CMakeLists.txt
@@ -11,8 +11,8 @@ else ()
endif ()
add_custom_target (self-extracting ALL
- ${CMAKE_COMMAND} -E remove clickhouse
+ ${CMAKE_COMMAND} -E remove clickhouse clickhouse-stripped
COMMAND ${COMPRESSOR} ${DECOMPRESSOR} clickhouse ../clickhouse
- DEPENDS clickhouse compressor
+ COMMAND ${COMPRESSOR} ${DECOMPRESSOR} clickhouse-stripped ../clickhouse-stripped
+ DEPENDS clickhouse clickhouse-stripped compressor
)
-
diff --git a/programs/server/config.xml b/programs/server/config.xml
index d0bf1c7d66a..d7ad1545201 100644
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@@ -104,15 +104,14 @@
-
-
+
- 3
+ 10
diff --git a/src/AggregateFunctions/AggregateFunctionUniq.h b/src/AggregateFunctions/AggregateFunctionUniq.h
index 1752d5751d5..f20fb8cb933 100644
--- a/src/AggregateFunctions/AggregateFunctionUniq.h
+++ b/src/AggregateFunctions/AggregateFunctionUniq.h
@@ -466,7 +466,7 @@ public:
std::vector data_vec;
data_vec.resize(places.size());
- for (unsigned long i = 0; i < data_vec.size(); i++)
+ for (size_t i = 0; i < data_vec.size(); ++i)
data_vec[i] = &this->data(places[i]).set;
DataSet::parallelizeMergePrepare(data_vec, thread_pool);
diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp
index c6fbd728b8f..7855c4f34a8 100644
--- a/src/Analyzer/Passes/QueryAnalysisPass.cpp
+++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp
@@ -1467,9 +1467,15 @@ ProjectionName QueryAnalyzer::calculateFunctionProjectionName(const QueryTreeNod
const ProjectionNames & arguments_projection_names)
{
const auto & function_node_typed = function_node->as();
+ const auto & function_node_name = function_node_typed.getFunctionName();
+
+ bool is_array_function = function_node_name == "array";
+ bool is_tuple_function = function_node_name == "tuple";
WriteBufferFromOwnString buffer;
- buffer << function_node_typed.getFunctionName();
+
+ if (!is_array_function && !is_tuple_function)
+ buffer << function_node_name;
if (!parameters_projection_names.empty())
{
@@ -1487,7 +1493,16 @@ ProjectionName QueryAnalyzer::calculateFunctionProjectionName(const QueryTreeNod
buffer << ')';
}
- buffer << '(';
+ char open_bracket = '(';
+ char close_bracket = ')';
+
+ if (is_array_function)
+ {
+ open_bracket = '[';
+ close_bracket = ']';
+ }
+
+ buffer << open_bracket;
size_t function_arguments_projection_names_size = arguments_projection_names.size();
for (size_t i = 0; i < function_arguments_projection_names_size; ++i)
@@ -1498,7 +1513,7 @@ ProjectionName QueryAnalyzer::calculateFunctionProjectionName(const QueryTreeNod
buffer << ", ";
}
- buffer << ')';
+ buffer << close_bracket;
return buffer.str();
}
diff --git a/src/Core/ServerSettings.h b/src/Core/ServerSettings.h
index 34a6296fb55..de0fff35389 100644
--- a/src/Core/ServerSettings.h
+++ b/src/Core/ServerSettings.h
@@ -75,7 +75,7 @@ namespace DB
\
M(Bool, disable_internal_dns_cache, false, "Disable internal DNS caching at all.", 0) \
M(Int32, dns_cache_update_period, 15, "Internal DNS cache update period in seconds.", 0) \
- M(UInt32, dns_max_consecutive_failures, 1024, "Max DNS resolve failures of a hostname before dropping the hostname from ClickHouse DNS cache.", 0) \
+ M(UInt32, dns_max_consecutive_failures, 10, "Max DNS resolve failures of a hostname before dropping the hostname from ClickHouse DNS cache.", 0) \
\
M(UInt64, max_table_size_to_drop, 50000000000lu, "If size of a table is greater than this value (in bytes) than table could not be dropped with any DROP query.", 0) \
M(UInt64, max_partition_size_to_drop, 50000000000lu, "Same as max_table_size_to_drop, but for the partitions.", 0) \
diff --git a/src/Dictionaries/RangeHashedDictionary.h b/src/Dictionaries/RangeHashedDictionary.h
index 624a57d65b5..9be9fa1d0d4 100644
--- a/src/Dictionaries/RangeHashedDictionary.h
+++ b/src/Dictionaries/RangeHashedDictionary.h
@@ -227,9 +227,7 @@ private:
struct KeyAttribute final
{
RangeStorageTypeContainer container;
-
RangeStorageTypeContainer invalid_intervals_container;
-
};
void createAttributes();
diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
index e923e49a7f1..3271a190193 100644
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.cpp
@@ -117,6 +117,9 @@ void DiskObjectStorageMetadata::serialize(WriteBuffer & buf, bool sync) const
if (storage_metadata_write_full_object_key)
write_version = VERSION_FULL_OBJECT_KEY;
+ if (!inline_data.empty() && write_version < VERSION_INLINE_DATA)
+ write_version = VERSION_INLINE_DATA;
+
chassert(write_version >= VERSION_ABSOLUTE_PATHS && write_version <= VERSION_FULL_OBJECT_KEY);
writeIntText(write_version, buf);
writeChar('\n', buf);
@@ -153,8 +156,11 @@ void DiskObjectStorageMetadata::serialize(WriteBuffer & buf, bool sync) const
writeBoolText(read_only, buf);
writeChar('\n', buf);
- writeEscapedString(inline_data, buf);
- writeChar('\n', buf);
+ if (write_version >= VERSION_INLINE_DATA)
+ {
+ writeEscapedString(inline_data, buf);
+ writeChar('\n', buf);
+ }
buf.finalize();
if (sync)
diff --git a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
index 658914b7611..729d93af10d 100644
--- a/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
+++ b/src/Disks/ObjectStorages/DiskObjectStorageMetadata.h
@@ -19,7 +19,7 @@ private:
static constexpr UInt32 VERSION_INLINE_DATA = 4;
static constexpr UInt32 VERSION_FULL_OBJECT_KEY = 5; /// only for reading data
- UInt32 version = VERSION_INLINE_DATA;
+ UInt32 version = VERSION_READ_ONLY_FLAG;
/// Absolute paths of blobs
ObjectKeysWithMetadata keys_with_meta;
diff --git a/src/Functions/CMakeLists.txt b/src/Functions/CMakeLists.txt
index 31f7f24eb13..57904a8ca1c 100644
--- a/src/Functions/CMakeLists.txt
+++ b/src/Functions/CMakeLists.txt
@@ -9,6 +9,7 @@ extract_into_parent_list(clickhouse_functions_sources dbms_sources
FunctionHelpers.cpp
extractTimeZoneFromFunctionArguments.cpp
FunctionsLogical.cpp
+ CastOverloadResolver.cpp
)
extract_into_parent_list(clickhouse_functions_headers dbms_headers
IFunction.h
@@ -16,6 +17,7 @@ extract_into_parent_list(clickhouse_functions_headers dbms_headers
FunctionHelpers.h
extractTimeZoneFromFunctionArguments.h
FunctionsLogical.h
+ CastOverloadResolver.h
)
add_library(clickhouse_functions_obj OBJECT ${clickhouse_functions_headers} ${clickhouse_functions_sources})
diff --git a/src/Functions/CastOverloadResolver.cpp b/src/Functions/CastOverloadResolver.cpp
index 20a08e3b60b..7fc46db50f1 100644
--- a/src/Functions/CastOverloadResolver.cpp
+++ b/src/Functions/CastOverloadResolver.cpp
@@ -1,10 +1,156 @@
#include
+#include
#include
+#include
namespace DB
{
+namespace ErrorCodes
+{
+ extern const int ILLEGAL_TYPE_OF_ARGUMENT;
+}
+
+/** CastInternal does not preserve nullability of the data type,
+ * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1).
+ *
+ * Cast preserves nullability according to setting `cast_keep_nullable`,
+ * i.e. Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1.
+ */
+template
+class CastOverloadResolverImpl : public IFunctionOverloadResolver
+{
+public:
+ using MonotonicityForRange = FunctionCastBase::MonotonicityForRange;
+
+ static constexpr auto name = cast_type == CastType::accurate
+ ? CastName::accurate_cast_name
+ : (cast_type == CastType::accurateOrNull ? CastName::accurate_cast_or_null_name : CastName::cast_name);
+
+ String getName() const override { return name; }
+
+ size_t getNumberOfArguments() const override { return 2; }
+
+ ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
+
+ explicit CastOverloadResolverImpl(ContextPtr context_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_)
+ : context(context_)
+ , diagnostic(std::move(diagnostic_))
+ , keep_nullable(keep_nullable_)
+ , data_type_validation_settings(data_type_validation_settings_)
+ {
+ }
+
+ static FunctionOverloadResolverPtr create(ContextPtr context)
+ {
+ const auto & settings_ref = context->getSettingsRef();
+
+ if constexpr (internal)
+ return createImpl(context, {}, false /*keep_nullable*/);
+
+ return createImpl(context, {}, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref));
+ }
+
+ static FunctionOverloadResolverPtr createImpl(ContextPtr context, std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {})
+ {
+ assert(!internal || !keep_nullable);
+ return std::make_unique(context, std::move(diagnostic), keep_nullable, data_type_validation_settings);
+ }
+
+ static FunctionOverloadResolverPtr createImpl(std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {})
+ {
+ assert(!internal || !keep_nullable);
+ return std::make_unique(ContextPtr(), std::move(diagnostic), keep_nullable, data_type_validation_settings);
+ }
+
+protected:
+
+ FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
+ {
+ DataTypes data_types(arguments.size());
+
+ for (size_t i = 0; i < arguments.size(); ++i)
+ data_types[i] = arguments[i].type;
+
+ auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get());
+ return std::make_unique>(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type);
+ }
+
+ DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
+ {
+ const auto & column = arguments.back().column;
+ if (!column)
+ throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. "
+ "Instead there is non-constant column of type {}", getName(), arguments.back().type->getName());
+
+ const auto * type_col = checkAndGetColumnConst(column.get());
+ if (!type_col)
+ throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. "
+ "Instead there is a column with the following structure: {}", getName(), column->dumpStructure());
+
+ DataTypePtr type = DataTypeFactory::instance().get(type_col->getValue());
+ validateDataType(type, data_type_validation_settings);
+
+ if constexpr (cast_type == CastType::accurateOrNull)
+ return makeNullable(type);
+
+ if constexpr (internal)
+ return type;
+
+ if (keep_nullable && arguments.front().type->isNullable() && type->canBeInsideNullable())
+ return makeNullable(type);
+
+ return type;
+ }
+
+ bool useDefaultImplementationForNulls() const override { return false; }
+ bool useDefaultImplementationForNothing() const override { return false; }
+ bool useDefaultImplementationForLowCardinalityColumns() const override { return false; }
+
+private:
+ ContextPtr context;
+ std::optional diagnostic;
+ bool keep_nullable;
+ DataTypeValidationSettings data_type_validation_settings;
+};
+
+
+struct CastOverloadName
+{
+ static constexpr auto cast_name = "CAST";
+ static constexpr auto accurate_cast_name = "accurateCast";
+ static constexpr auto accurate_cast_or_null_name = "accurateCastOrNull";
+};
+
+struct CastInternalOverloadName
+{
+ static constexpr auto cast_name = "_CAST";
+ static constexpr auto accurate_cast_name = "accurate_Cast";
+ static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull";
+};
+
+template
+using CastOverloadResolver = CastOverloadResolverImpl;
+
+template
+using CastInternalOverloadResolver = CastOverloadResolverImpl;
+
+
+FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic)
+{
+ switch (type)
+ {
+ case CastType::nonAccurate:
+ return CastInternalOverloadResolver::createImpl(diagnostic);
+ case CastType::accurate:
+ return CastInternalOverloadResolver::createImpl(diagnostic);
+ case CastType::accurateOrNull:
+ return CastInternalOverloadResolver::createImpl(diagnostic);
+ }
+}
+
+
REGISTER_FUNCTION(CastOverloadResolvers)
{
factory.registerFunction>({}, FunctionFactory::CaseInsensitive);
diff --git a/src/Functions/CastOverloadResolver.h b/src/Functions/CastOverloadResolver.h
index 670cd364a29..4346478e5b6 100644
--- a/src/Functions/CastOverloadResolver.h
+++ b/src/Functions/CastOverloadResolver.h
@@ -1,138 +1,29 @@
#pragma once
-#include
-#include
+
+#include
+#include
+#include
+
namespace DB
{
-namespace ErrorCodes
+class IFunctionOverloadResolver;
+using FunctionOverloadResolverPtr = std::shared_ptr;
+
+enum class CastType
{
- extern const int ILLEGAL_TYPE_OF_ARGUMENT;
-}
-
-/** CastInternal does not preserve nullability of the data type,
- * i.e. CastInternal(toNullable(toInt8(1)) as Int32) will be Int32(1).
- *
- * Cast preserves nullability according to setting `cast_keep_nullable`,
- * i.e. Cast(toNullable(toInt8(1)) as Int32) will be Nullable(Int32(1)) if `cast_keep_nullable` == 1.
- */
-template
-class CastOverloadResolverImpl : public IFunctionOverloadResolver
-{
-public:
- using MonotonicityForRange = FunctionCastBase::MonotonicityForRange;
- using Diagnostic = FunctionCastBase::Diagnostic;
-
- static constexpr auto name = cast_type == CastType::accurate
- ? CastName::accurate_cast_name
- : (cast_type == CastType::accurateOrNull ? CastName::accurate_cast_or_null_name : CastName::cast_name);
-
- String getName() const override { return name; }
-
- size_t getNumberOfArguments() const override { return 2; }
-
- ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
-
- explicit CastOverloadResolverImpl(ContextPtr context_, std::optional diagnostic_, bool keep_nullable_, const DataTypeValidationSettings & data_type_validation_settings_)
- : context(context_)
- , diagnostic(std::move(diagnostic_))
- , keep_nullable(keep_nullable_)
- , data_type_validation_settings(data_type_validation_settings_)
- {
- }
-
- static FunctionOverloadResolverPtr create(ContextPtr context)
- {
- const auto & settings_ref = context->getSettingsRef();
-
- if constexpr (internal)
- return createImpl(context, {}, false /*keep_nullable*/);
-
- return createImpl(context, {}, settings_ref.cast_keep_nullable, DataTypeValidationSettings(settings_ref));
- }
-
- static FunctionOverloadResolverPtr createImpl(ContextPtr context, std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {})
- {
- assert(!internal || !keep_nullable);
- return std::make_unique(context, std::move(diagnostic), keep_nullable, data_type_validation_settings);
- }
-
- static FunctionOverloadResolverPtr createImpl(std::optional diagnostic = {}, bool keep_nullable = false, const DataTypeValidationSettings & data_type_validation_settings = {})
- {
- assert(!internal || !keep_nullable);
- return std::make_unique(ContextPtr(), std::move(diagnostic), keep_nullable, data_type_validation_settings);
- }
-
-protected:
-
- FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & return_type) const override
- {
- DataTypes data_types(arguments.size());
-
- for (size_t i = 0; i < arguments.size(); ++i)
- data_types[i] = arguments[i].type;
-
- auto monotonicity = MonotonicityHelper::getMonotonicityInformation(arguments.front().type, return_type.get());
- return std::make_unique>(context, name, std::move(monotonicity), data_types, return_type, diagnostic, cast_type);
- }
-
- DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
- {
- const auto & column = arguments.back().column;
- if (!column)
- throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. "
- "Instead there is non-constant column of type {}", getName(), arguments.back().type->getName());
-
- const auto * type_col = checkAndGetColumnConst(column.get());
- if (!type_col)
- throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Second argument to {} must be a constant string describing type. "
- "Instead there is a column with the following structure: {}", getName(), column->dumpStructure());
-
- DataTypePtr type = DataTypeFactory::instance().get(type_col->getValue());
- validateDataType(type, data_type_validation_settings);
-
- if constexpr (cast_type == CastType::accurateOrNull)
- return makeNullable(type);
-
- if constexpr (internal)
- return type;
-
- if (keep_nullable && arguments.front().type->isNullable() && type->canBeInsideNullable())
- return makeNullable(type);
-
- return type;
- }
-
- bool useDefaultImplementationForNulls() const override { return false; }
- bool useDefaultImplementationForNothing() const override { return false; }
- bool useDefaultImplementationForLowCardinalityColumns() const override { return false; }
-
-private:
- ContextPtr context;
- std::optional diagnostic;
- bool keep_nullable;
- DataTypeValidationSettings data_type_validation_settings;
+ nonAccurate,
+ accurate,
+ accurateOrNull
};
-
-struct CastOverloadName
+struct CastDiagnostic
{
- static constexpr auto cast_name = "CAST";
- static constexpr auto accurate_cast_name = "accurateCast";
- static constexpr auto accurate_cast_or_null_name = "accurateCastOrNull";
+ std::string column_from;
+ std::string column_to;
};
-struct CastInternalOverloadName
-{
- static constexpr auto cast_name = "_CAST";
- static constexpr auto accurate_cast_name = "accurate_Cast";
- static constexpr auto accurate_cast_or_null_name = "accurate_CastOrNull";
-};
-
-template
-using CastOverloadResolver = CastOverloadResolverImpl;
-
-template
-using CastInternalOverloadResolver = CastOverloadResolverImpl;
+FunctionOverloadResolverPtr createInternalCastOverloadResolver(CastType type, std::optional diagnostic);
}
diff --git a/src/Functions/FunctionTokens.h b/src/Functions/FunctionTokens.h
new file mode 100644
index 00000000000..5c4e582c637
--- /dev/null
+++ b/src/Functions/FunctionTokens.h
@@ -0,0 +1,211 @@
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ILLEGAL_COLUMN;
+}
+
+
+/** Functions that split strings into an array of strings.
+ *
+ * splitByChar(sep, s[, max_substrings])
+ * splitByString(sep, s[, max_substrings])
+ * splitByRegexp(regexp, s[, max_substrings])
+ *
+ * splitByWhitespace(s[, max_substrings]) - split the string by whitespace characters
+ * splitByNonAlpha(s[, max_substrings]) - split the string by whitespace and punctuation characters
+ *
+ * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp.
+ * - the first subpattern, if the regexp has a subpattern;
+ * - the zero subpattern (the whole match), otherwise;
+ * - an empty array, if nothing matches.
+ *
+ * alphaTokens(s[, max_substrings]) - select from the string subsequence `[a-zA-Z]+`.
+ *
+ * URL functions are located separately.
+ */
+
+
+/// A function that takes a string and returns an array of the substrings produced by `Generator` (used through init(), set() and get()).
+template <typename Generator>
+class FunctionTokens : public IFunction
+{
+private:
+    using Pos = const char *;
+    bool max_substrings_includes_remaining_string; /// Copy of the setting of the same name; forwarded to Generator::init().
+
+public:
+    static constexpr auto name = Generator::name;
+    static FunctionPtr create(ContextPtr context) { return std::make_shared<FunctionTokens>(context); }
+
+    explicit FunctionTokens(ContextPtr context)
+    {
+        const Settings & settings = context->getSettingsRef();
+        max_substrings_includes_remaining_string = settings.splitby_max_substrings_includes_remaining_string;
+    }
+
+    String getName() const override { return name; }
+
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
+
+    bool isVariadic() const override { return Generator::isVariadic(); }
+
+    size_t getNumberOfArguments() const override { return Generator::getNumberOfArguments(); }
+
+    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
+    {
+        Generator::checkArguments(*this, arguments);
+        /// Argument validation is delegated to the generator; the result type is the same for every generator.
+        return std::make_shared<DataTypeArray>(std::make_shared<DataTypeString>());
+    }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
+    {
+        Generator generator;
+        generator.init(arguments, max_substrings_includes_remaining_string);
+        /// The generator reports which argument holds the strings to tokenize.
+        const auto & array_argument = arguments[generator.strings_argument_position];
+        /// The input is handled either as a full string column or as a constant one; anything else is rejected at the end.
+        const ColumnString * col_str = checkAndGetColumn<ColumnString>(array_argument.column.get());
+        const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get());
+
+        auto col_res = ColumnArray::create(ColumnString::create());
+
+        ColumnString & res_strings = typeid_cast<ColumnString &>(col_res->getData());
+        ColumnString::Chars & res_strings_chars = res_strings.getChars();
+        ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets();
+
+        ColumnArray::Offsets & res_offsets = col_res->getOffsets();
+
+        if (col_str)
+        {
+            const ColumnString::Chars & src_chars = col_str->getChars();
+            const ColumnString::Offsets & src_offsets = col_str->getOffsets();
+
+            res_offsets.reserve(src_offsets.size());
+            res_strings_offsets.reserve(src_offsets.size() * 5); /// A guess of 5 tokens per row; the constant is arbitrary.
+            res_strings_chars.reserve(src_chars.size());
+
+            Pos token_begin = nullptr;
+            Pos token_end = nullptr;
+
+            size_t size = src_offsets.size();
+            ColumnString::Offset current_src_offset = 0;
+            ColumnArray::Offset current_dst_offset = 0;
+            ColumnString::Offset current_dst_strings_offset = 0;
+            for (size_t i = 0; i < size; ++i)
+            {
+                Pos pos = reinterpret_cast<Pos>(&src_chars[current_src_offset]);
+                current_src_offset = src_offsets[i];
+                Pos end = reinterpret_cast<Pos>(&src_chars[current_src_offset]) - 1;
+                /// [pos, end) is the current row; 'end' stops one byte short of the row's offset (presumably the terminating zero byte).
+                generator.set(pos, end);
+                size_t j = 0; /// Number of tokens found in this row.
+                while (generator.get(token_begin, token_end))
+                {
+                    size_t token_size = token_end - token_begin;
+                    /// Append the token followed by a zero byte.
+                    res_strings_chars.resize(res_strings_chars.size() + token_size + 1);
+                    memcpySmallAllowReadWriteOverflow15(&res_strings_chars[current_dst_strings_offset], token_begin, token_size);
+                    res_strings_chars[current_dst_strings_offset + token_size] = 0;
+
+                    current_dst_strings_offset += token_size + 1;
+                    res_strings_offsets.push_back(current_dst_strings_offset);
+                    ++j;
+                }
+                /// Array offsets are cumulative: each row ends at the running total of tokens.
+                current_dst_offset += j;
+                res_offsets.push_back(current_dst_offset);
+            }
+
+            return col_res;
+        }
+        else if (col_str_const)
+        {
+            String src = col_str_const->getValue<String>();
+            Array dst;
+            /// A constant input is tokenized once; the result is again a constant column with the same number of rows.
+            generator.set(src.data(), src.data() + src.size());
+            Pos token_begin = nullptr;
+            Pos token_end = nullptr;
+
+            while (generator.get(token_begin, token_end))
+                dst.push_back(String(token_begin, token_end - token_begin));
+
+            return result_type->createColumnConst(col_str_const->size(), dst);
+        }
+        else
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of argument of function {}",
+                    array_argument.column->getName(), getName());
+    }
+};
+
+
+/// Helper functions for implementations
+/// Returns the constant `max_substrings` argument at the given position; std::nullopt if it is absent, not a constant column, or zero.
+static inline std::optional<size_t> extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position)
+{
+    if (max_substrings_argument_position >= arguments.size())
+        return std::nullopt;
+
+    const ColumnConst * column = checkAndGetColumn<ColumnConst>(arguments[max_substrings_argument_position].column.get());
+    if (!column)
+        return std::nullopt;
+
+    /// The value is read as an unsigned integer; zero is reported like a missing argument.
+    if (size_t res = column->getUInt(0))
+        return res;
+    return std::nullopt;
+}
+
+/// Validates the argument list (const String separator, String s[, const integer max_substrings]); `func` is forwarded to the validator.
+static inline void checkArgumentsWithSeparatorAndOptionalMaxSubstrings(const IFunction & func, const ColumnsWithTypeAndName & arguments)
+{
+    /// The separator comes first and must be constant; the string to split follows it.
+    FunctionArgumentDescriptors required_descriptors{
+        {"separator", &isString, isColumnConst, "const String"},
+        {"s", &isString, nullptr, "String"}
+    };
+
+    /// The limit may be omitted; if present it has to be a constant native integer.
+    FunctionArgumentDescriptors optional_descriptors{{"max_substrings", &isNativeInteger, isColumnConst, "const Number"}};
+
+    validateFunctionArgumentTypes(func, arguments, required_descriptors, optional_descriptors);
+}
+
+/// Validates the argument list (String s[, const integer max_substrings]).
+/// `func` is only forwarded to validateFunctionArgumentTypes().
+static inline void checkArgumentsWithOptionalMaxSubstrings(const IFunction & func, const ColumnsWithTypeAndName & arguments)
+{
+    /// Only the string to tokenize is mandatory.
+    FunctionArgumentDescriptors required_descriptors{{"s", &isString, nullptr, "String"}};
+
+    /// The limit may be omitted; if present it has to be a constant native integer.
+    FunctionArgumentDescriptors optional_descriptors{{"max_substrings", &isNativeInteger, isColumnConst, "const Number"}};
+
+    validateFunctionArgumentTypes(func, arguments, required_descriptors, optional_descriptors);
+}
+
+}
diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h
index d5f1f175a37..b0a262ff36b 100644
--- a/src/Functions/FunctionsConversion.h
+++ b/src/Functions/FunctionsConversion.h
@@ -53,6 +53,7 @@
#include
#include
#include
+#include
#include
#include
#include
@@ -3127,14 +3128,8 @@ class ExecutableFunctionCast : public IExecutableFunction
public:
using WrapperType = std::function;
- struct Diagnostic
- {
- std::string column_from;
- std::string column_to;
- };
-
explicit ExecutableFunctionCast(
- WrapperType && wrapper_function_, const char * name_, std::optional diagnostic_)
+ WrapperType && wrapper_function_, const char * name_, std::optional diagnostic_)
: wrapper_function(std::move(wrapper_function_)), name(name_), diagnostic(std::move(diagnostic_)) {}
String getName() const override { return name; }
@@ -3170,24 +3165,16 @@ protected:
private:
WrapperType wrapper_function;
const char * name;
- std::optional diagnostic;
+ std::optional diagnostic;
};
struct CastName { static constexpr auto name = "CAST"; };
struct CastInternalName { static constexpr auto name = "_CAST"; };
-enum class CastType
-{
- nonAccurate,
- accurate,
- accurateOrNull
-};
-
class FunctionCastBase : public IFunctionBase
{
public:
using MonotonicityForRange = std::function;
- using Diagnostic = ExecutableFunctionCast::Diagnostic;
};
template
@@ -3201,7 +3188,7 @@ public:
, MonotonicityForRange && monotonicity_for_range_
, const DataTypes & argument_types_
, const DataTypePtr & return_type_
- , std::optional diagnostic_
+ , std::optional diagnostic_
, CastType cast_type_)
: cast_name(cast_name_), monotonicity_for_range(std::move(monotonicity_for_range_))
, argument_types(argument_types_), return_type(return_type_), diagnostic(std::move(diagnostic_))
@@ -3251,7 +3238,7 @@ private:
DataTypes argument_types;
DataTypePtr return_type;
- std::optional diagnostic;
+ std::optional diagnostic;
CastType cast_type;
ContextPtr context;
diff --git a/src/Functions/FunctionsStringArray.cpp b/src/Functions/FunctionsStringArray.cpp
deleted file mode 100644
index 4afee55704f..00000000000
--- a/src/Functions/FunctionsStringArray.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-#include
-#include
-
-namespace DB
-{
-namespace ErrorCodes
-{
- extern const int ILLEGAL_COLUMN;
-}
-
-template
-std::optional extractMaxSplitsImpl(const ColumnWithTypeAndName & argument)
-{
- const auto * col = checkAndGetColumnConst>(argument.column.get());
- if (!col)
- return std::nullopt;
-
- auto value = col->template getValue();
- return static_cast(value);
-}
-
-std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position)
-{
- if (max_substrings_argument_position >= arguments.size())
- return std::nullopt;
-
- std::optional max_splits;
- if (!((max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position]))
- || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position]))
- || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position]))
- || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position])) || (max_splits = extractMaxSplitsImpl(arguments[max_substrings_argument_position]))))
- throw Exception(
- ErrorCodes::ILLEGAL_COLUMN,
- "Illegal column {}, which is {}-th argument",
- arguments[max_substrings_argument_position].column->getName(),
- max_substrings_argument_position + 1);
-
- if (*max_splits <= 0)
- return std::nullopt;
-
- return max_splits;
-}
-
-DataTypePtr FunctionArrayStringConcat::getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const
-{
- FunctionArgumentDescriptors mandatory_args{
- {"arr", &isArray, nullptr, "Array"},
- };
-
- FunctionArgumentDescriptors optional_args{
- {"separator", &isString, isColumnConst, "const String"},
- };
-
- validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args);
-
- return std::make_shared();
-}
-
-REGISTER_FUNCTION(StringArray)
-{
- factory.registerFunction();
-
- factory.registerFunction();
- factory.registerAlias("splitByAlpha", FunctionSplitByAlpha::name);
- factory.registerFunction();
- factory.registerFunction();
- factory.registerFunction();
- factory.registerFunction();
- factory.registerFunction();
- factory.registerFunction();
-}
-
-}
diff --git a/src/Functions/FunctionsStringArray.h b/src/Functions/FunctionsStringArray.h
deleted file mode 100644
index d7d7e3b5100..00000000000
--- a/src/Functions/FunctionsStringArray.h
+++ /dev/null
@@ -1,990 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-namespace DB
-{
-
-namespace ErrorCodes
-{
- extern const int BAD_ARGUMENTS;
- extern const int ILLEGAL_COLUMN;
-}
-
-
-/** Functions that split strings into an array of strings or vice versa.
- *
- * splitByChar(sep, s[, max_substrings])
- * splitByString(sep, s[, max_substrings])
- * splitByRegexp(regexp, s[, max_substrings])
- *
- * splitByWhitespace(s[, max_substrings]) - split the string by whitespace characters
- * splitByNonAlpha(s[, max_substrings]) - split the string by whitespace and punctuation characters
- *
- * extractAll(s, regexp) - select from the string the subsequences corresponding to the regexp.
- * - first subpattern, if regexp has subpattern;
- * - zero subpattern (the match part, otherwise);
- * - otherwise, an empty array
- *
- * arrayStringConcat(arr)
- * arrayStringConcat(arr, delimiter)
- * - join an array of strings into one string via a separator.
- *
- * alphaTokens(s[, max_substrings]) - select from the string subsequence `[a-zA-Z]+`.
- *
- * URL functions are located separately.
- */
-
-
-using Pos = const char *;
-
-std::optional extractMaxSplits(const ColumnsWithTypeAndName & arguments, size_t max_substrings_argument_position);
-
-/// Substring generators. All of them have a common interface.
-
-class SplitByAlphaImpl
-{
-private:
- Pos pos;
- Pos end;
- std::optional max_splits;
- size_t splits;
- bool max_substrings_includes_remaining_string;
-
-public:
- static constexpr auto name = "alphaTokens";
- static String getName() { return name; }
-
- static bool isVariadic() { return true; }
-
- static size_t getNumberOfArguments() { return 0; }
-
- static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
- {
- FunctionArgumentDescriptors mandatory_args{
- {"s", &isString, nullptr, "String"},
- };
-
- FunctionArgumentDescriptors optional_args{
- {"max_substrings", &isNativeInteger, isColumnConst, "const Number"},
- };
-
- validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args);
- }
-
- static constexpr auto strings_argument_position = 0uz;
-
- void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
- {
- max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
- max_splits = extractMaxSplits(arguments, 1);
- }
-
- /// Called for each next string.
- void set(Pos pos_, Pos end_)
- {
- pos = pos_;
- end = end_;
- splits = 0;
- }
-
- /// Get the next token, if any, or return false.
- bool get(Pos & token_begin, Pos & token_end)
- {
- /// Skip garbage
- while (pos < end && !isAlphaASCII(*pos))
- ++pos;
-
- if (pos == end)
- return false;
-
- token_begin = pos;
-
- if (max_splits)
- {
- if (max_substrings_includes_remaining_string)
- {
- if (splits == *max_splits - 1)
- {
- token_end = end;
- pos = end;
- return true;
- }
- }
- else
- if (splits == *max_splits)
- return false;
- }
-
- while (pos < end && isAlphaASCII(*pos))
- ++pos;
-
- token_end = pos;
- ++splits;
-
- return true;
- }
-};
-
-class SplitByNonAlphaImpl
-{
-private:
- Pos pos;
- Pos end;
- std::optional max_splits;
- size_t splits;
- bool max_substrings_includes_remaining_string;
-
-public:
- /// Get the name of the function.
- static constexpr auto name = "splitByNonAlpha";
- static String getName() { return name; }
-
- static bool isVariadic() { return true; }
- static size_t getNumberOfArguments() { return 0; }
-
- static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
- {
- SplitByAlphaImpl::checkArguments(func, arguments);
- }
-
- static constexpr auto strings_argument_position = 0uz;
-
- void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
- {
- max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
- max_splits = extractMaxSplits(arguments, 1);
- }
-
- /// Called for each next string.
- void set(Pos pos_, Pos end_)
- {
- pos = pos_;
- end = end_;
- splits = 0;
- }
-
- /// Get the next token, if any, or return false.
- bool get(Pos & token_begin, Pos & token_end)
- {
- /// Skip garbage
- while (pos < end && (isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
- ++pos;
-
- if (pos == end)
- return false;
-
- token_begin = pos;
-
- if (max_splits)
- {
- if (max_substrings_includes_remaining_string)
- {
- if (splits == *max_splits - 1)
- {
- token_end = end;
- pos = end;
- return true;
- }
- }
- else
- if (splits == *max_splits)
- return false;
- }
-
- while (pos < end && !(isWhitespaceASCII(*pos) || isPunctuationASCII(*pos)))
- ++pos;
-
- token_end = pos;
- splits++;
-
- return true;
- }
-};
-
-class SplitByWhitespaceImpl
-{
-private:
- Pos pos;
- Pos end;
- std::optional max_splits;
- size_t splits;
- bool max_substrings_includes_remaining_string;
-
-public:
- static constexpr auto name = "splitByWhitespace";
- static String getName() { return name; }
-
- static bool isVariadic() { return true; }
- static size_t getNumberOfArguments() { return 0; }
-
- static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
- {
- return SplitByNonAlphaImpl::checkArguments(func, arguments);
- }
-
- static constexpr auto strings_argument_position = 0uz;
-
- void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
- {
- max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
- max_splits = extractMaxSplits(arguments, 1);
- }
-
- /// Called for each next string.
- void set(Pos pos_, Pos end_)
- {
- pos = pos_;
- end = end_;
- splits = 0;
- }
-
- /// Get the next token, if any, or return false.
- bool get(Pos & token_begin, Pos & token_end)
- {
- /// Skip garbage
- while (pos < end && isWhitespaceASCII(*pos))
- ++pos;
-
- if (pos == end)
- return false;
-
- token_begin = pos;
-
- if (max_splits)
- {
- if (max_substrings_includes_remaining_string)
- {
- if (splits == *max_splits - 1)
- {
- token_end = end;
- pos = end;
- return true;
- }
- }
- else
- if (splits == *max_splits)
- return false;
- }
-
- while (pos < end && !isWhitespaceASCII(*pos))
- ++pos;
-
- token_end = pos;
- splits++;
-
- return true;
- }
-};
-
-class SplitByCharImpl
-{
-private:
- Pos pos;
- Pos end;
- char separator;
- std::optional max_splits;
- size_t splits;
- bool max_substrings_includes_remaining_string;
-
-public:
- static constexpr auto name = "splitByChar";
- static String getName() { return name; }
- static bool isVariadic() { return true; }
- static size_t getNumberOfArguments() { return 0; }
-
- static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
- {
- FunctionArgumentDescriptors mandatory_args{
- {"separator", &isString, isColumnConst, "const String"},
- {"s", &isString, nullptr, "String"}
- };
-
- FunctionArgumentDescriptors optional_args{
- {"max_substrings", &isNativeInteger, isColumnConst, "const Number"},
- };
-
- validateFunctionArgumentTypes(func, arguments, mandatory_args, optional_args);
- }
-
- static constexpr auto strings_argument_position = 1uz;
-
- void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
- {
- const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
-
- if (!col)
- throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. "
- "Must be constant string.", arguments[0].column->getName(), getName());
-
- String sep_str = col->getValue();
-
- if (sep_str.size() != 1)
- throw Exception(ErrorCodes::BAD_ARGUMENTS, "Illegal separator for function {}. Must be exactly one byte.", getName());
-
- separator = sep_str[0];
-
- max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
- max_splits = extractMaxSplits(arguments, 2);
- }
-
- void set(Pos pos_, Pos end_)
- {
- pos = pos_;
- end = end_;
- splits = 0;
- }
-
- bool get(Pos & token_begin, Pos & token_end)
- {
- if (!pos)
- return false;
-
- token_begin = pos;
-
- if (max_splits)
- {
- if (max_substrings_includes_remaining_string)
- {
- if (splits == *max_splits - 1)
- {
- token_end = end;
- pos = nullptr;
- return true;
- }
- }
- else
- if (splits == *max_splits)
- return false;
- }
-
- pos = reinterpret_cast(memchr(pos, separator, end - pos));
- if (pos)
- {
- token_end = pos;
- ++pos;
- ++splits;
- }
- else
- token_end = end;
-
- return true;
- }
-};
-
-
-class SplitByStringImpl
-{
-private:
- Pos pos;
- Pos end;
- String separator;
- std::optional max_splits;
- size_t splits;
- bool max_substrings_includes_remaining_string;
-
-public:
- static constexpr auto name = "splitByString";
- static String getName() { return name; }
- static bool isVariadic() { return true; }
- static size_t getNumberOfArguments() { return 0; }
-
- static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
- {
- SplitByCharImpl::checkArguments(func, arguments);
- }
-
- static constexpr auto strings_argument_position = 1uz;
-
- void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
- {
- const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
-
- if (!col)
- throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. "
- "Must be constant string.", arguments[0].column->getName(), getName());
-
- separator = col->getValue();
-
- max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
- max_splits = extractMaxSplits(arguments, 2);
- }
-
- /// Called for each next string.
- void set(Pos pos_, Pos end_)
- {
- pos = pos_;
- end = end_;
- splits = 0;
- }
-
- /// Get the next token, if any, or return false.
- bool get(Pos & token_begin, Pos & token_end)
- {
- if (separator.empty())
- {
- if (pos == end)
- return false;
-
- token_begin = pos;
-
- if (max_splits)
- {
- if (max_substrings_includes_remaining_string)
- {
- if (splits == *max_splits - 1)
- {
- token_end = end;
- pos = end;
- return true;
- }
- }
- else
- if (splits == *max_splits)
- return false;
- }
-
- pos += 1;
- token_end = pos;
- ++splits;
- }
- else
- {
- if (!pos)
- return false;
-
- token_begin = pos;
-
- if (max_splits)
- {
- if (max_substrings_includes_remaining_string)
- {
- if (splits == *max_splits - 1)
- {
- token_end = end;
- pos = nullptr;
- return true;
- }
- }
- else
- if (splits == *max_splits)
- return false;
- }
-
- pos = reinterpret_cast(memmem(pos, end - pos, separator.data(), separator.size()));
- if (pos)
- {
- token_end = pos;
- pos += separator.size();
- ++splits;
- }
- else
- token_end = end;
- }
-
- return true;
- }
-};
-
-class SplitByRegexpImpl
-{
-private:
- Regexps::RegexpPtr re;
- OptimizedRegularExpression::MatchVec matches;
-
- Pos pos;
- Pos end;
-
- std::optional max_splits;
- size_t splits;
- bool max_substrings_includes_remaining_string;
-
-public:
- static constexpr auto name = "splitByRegexp";
- static String getName() { return name; }
-
- static bool isVariadic() { return true; }
- static size_t getNumberOfArguments() { return 0; }
-
- static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
- {
- SplitByStringImpl::checkArguments(func, arguments);
- }
-
- static constexpr auto strings_argument_position = 1uz;
-
- void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
- {
- const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[0].column.get());
-
- if (!col)
- throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. "
- "Must be constant string.", arguments[0].column->getName(), getName());
-
- if (!col->getValue().empty())
- re = std::make_shared(Regexps::createRegexp(col->getValue()));
-
- max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
- max_splits = extractMaxSplits(arguments, 2);
- }
-
- /// Called for each next string.
- void set(Pos pos_, Pos end_)
- {
- pos = pos_;
- end = end_;
- splits = 0;
- }
-
- /// Get the next token, if any, or return false.
- bool get(Pos & token_begin, Pos & token_end)
- {
- if (!re)
- {
- if (pos == end)
- return false;
-
- token_begin = pos;
-
- if (max_splits)
- {
- if (max_substrings_includes_remaining_string)
- {
- if (splits == *max_splits - 1)
- {
- token_end = end;
- pos = end;
- return true;
- }
- }
- else
- if (splits == *max_splits)
- return false;
- }
-
- pos += 1;
- token_end = pos;
- ++splits;
- }
- else
- {
- if (!pos || pos > end)
- return false;
-
- token_begin = pos;
-
- if (max_splits)
- {
- if (max_substrings_includes_remaining_string)
- {
- if (splits == *max_splits - 1)
- {
- token_end = end;
- pos = nullptr;
- return true;
- }
- }
- else
- if (splits == *max_splits)
- return false;
- }
-
- if (!re->match(pos, end - pos, matches) || !matches[0].length)
- {
- token_end = end;
- pos = end + 1;
- }
- else
- {
- token_end = pos + matches[0].offset;
- pos = token_end + matches[0].length;
- ++splits;
- }
- }
-
- return true;
- }
-};
-
-class ExtractAllImpl
-{
-private:
- Regexps::RegexpPtr re;
- OptimizedRegularExpression::MatchVec matches;
- size_t capture;
-
- Pos pos;
- Pos end;
-public:
- static constexpr auto name = "extractAll";
- static String getName() { return name; }
- static bool isVariadic() { return false; }
- static size_t getNumberOfArguments() { return 2; }
-
- static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
- {
- FunctionArgumentDescriptors mandatory_args{
- {"haystack", &isString, nullptr, "String"},
- {"pattern", &isString, isColumnConst, "const String"}
- };
-
- validateFunctionArgumentTypes(func, arguments, mandatory_args);
- }
-
- static constexpr auto strings_argument_position = 0uz;
-
- void init(const ColumnsWithTypeAndName & arguments, bool /*max_substrings_includes_remaining_string*/)
- {
- const ColumnConst * col = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
-
- if (!col)
- throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal column {} of first argument of function {}. "
- "Must be constant string.", arguments[1].column->getName(), getName());
-
- re = std::make_shared(Regexps::createRegexp(col->getValue()));
- capture = re->getNumberOfSubpatterns() > 0 ? 1 : 0;
-
- matches.resize(capture + 1);
- }
-
- /// Called for each next string.
- void set(Pos pos_, Pos end_)
- {
- pos = pos_;
- end = end_;
- }
-
- /// Get the next token, if any, or return false.
- bool get(Pos & token_begin, Pos & token_end)
- {
- if (!pos || pos > end)
- return false;
-
- if (!re->match(pos, end - pos, matches) || !matches[0].length)
- return false;
-
- if (matches[capture].offset == std::string::npos)
- {
- /// Empty match.
- token_begin = pos;
- token_end = pos;
- }
- else
- {
- token_begin = pos + matches[capture].offset;
- token_end = token_begin + matches[capture].length;
- }
-
- pos += matches[0].offset + matches[0].length;
-
- return true;
- }
-};
-
-/// A function that takes a string, and returns an array of substrings created by some generator.
-template
-class FunctionTokens : public IFunction
-{
-private:
- bool max_substrings_includes_remaining_string;
-
-public:
- static constexpr auto name = Generator::name;
- static FunctionPtr create(ContextPtr context) { return std::make_shared(context); }
-
- explicit FunctionTokens(ContextPtr context)
- {
- const Settings & settings = context->getSettingsRef();
- max_substrings_includes_remaining_string = settings.splitby_max_substrings_includes_remaining_string;
- }
-
- String getName() const override { return name; }
-
- bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
-
- bool isVariadic() const override { return Generator::isVariadic(); }
-
- size_t getNumberOfArguments() const override { return Generator::getNumberOfArguments(); }
-
- DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
- {
- Generator::checkArguments(*this, arguments);
-
- return std::make_shared(std::make_shared());
- }
-
- ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
- {
- Generator generator;
- generator.init(arguments, max_substrings_includes_remaining_string);
-
- const auto & array_argument = arguments[generator.strings_argument_position];
-
- const ColumnString * col_str = checkAndGetColumn(array_argument.column.get());
- const ColumnConst * col_str_const = checkAndGetColumnConstStringOrFixedString(array_argument.column.get());
-
- auto col_res = ColumnArray::create(ColumnString::create());
-
- ColumnString & res_strings = typeid_cast(col_res->getData());
- ColumnString::Chars & res_strings_chars = res_strings.getChars();
- ColumnString::Offsets & res_strings_offsets = res_strings.getOffsets();
-
- ColumnArray::Offsets & res_offsets = col_res->getOffsets();
-
- if (col_str)
- {
- const ColumnString::Chars & src_chars = col_str->getChars();
- const ColumnString::Offsets & src_offsets = col_str->getOffsets();
-
- res_offsets.reserve(src_offsets.size());
- res_strings_offsets.reserve(src_offsets.size() * 5); /// Constant 5 - at random.
- res_strings_chars.reserve(src_chars.size());
-
- Pos token_begin = nullptr;
- Pos token_end = nullptr;
-
- size_t size = src_offsets.size();
- ColumnString::Offset current_src_offset = 0;
- ColumnArray::Offset current_dst_offset = 0;
- ColumnString::Offset current_dst_strings_offset = 0;
- for (size_t i = 0; i < size; ++i)
- {
- Pos pos = reinterpret_cast(&src_chars[current_src_offset]);
- current_src_offset = src_offsets[i];
- Pos end = reinterpret_cast(&src_chars[current_src_offset]) - 1;
-
- generator.set(pos, end);
- size_t j = 0;
- while (generator.get(token_begin, token_end))
- {
- size_t token_size = token_end - token_begin;
-
- res_strings_chars.resize(res_strings_chars.size() + token_size + 1);
- memcpySmallAllowReadWriteOverflow15(&res_strings_chars[current_dst_strings_offset], token_begin, token_size);
- res_strings_chars[current_dst_strings_offset + token_size] = 0;
-
- current_dst_strings_offset += token_size + 1;
- res_strings_offsets.push_back(current_dst_strings_offset);
- ++j;
- }
-
- current_dst_offset += j;
- res_offsets.push_back(current_dst_offset);
- }
-
- return col_res;
- }
- else if (col_str_const)
- {
- String src = col_str_const->getValue();
- Array dst;
-
- generator.set(src.data(), src.data() + src.size());
- Pos token_begin = nullptr;
- Pos token_end = nullptr;
-
- while (generator.get(token_begin, token_end))
- dst.push_back(String(token_begin, token_end - token_begin));
-
- return result_type->createColumnConst(col_str_const->size(), dst);
- }
- else
- throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Illegal columns {}, {} of arguments of function {}",
- array_argument.column->getName(), array_argument.column->getName(), getName());
- }
-};
-
-
-/// Joins an array of type serializable to string into one string via a separator.
-class FunctionArrayStringConcat : public IFunction
-{
-private:
- static void executeInternal(
- const ColumnString::Chars & src_chars,
- const ColumnString::Offsets & src_string_offsets,
- const ColumnArray::Offsets & src_array_offsets,
- const char * delimiter,
- const size_t delimiter_size,
- ColumnString::Chars & dst_chars,
- ColumnString::Offsets & dst_string_offsets,
- const char8_t * null_map)
- {
- size_t size = src_array_offsets.size();
-
- if (!size)
- return;
-
- /// With a small margin - as if the separator goes after the last string of the array.
- dst_chars.resize(
- src_chars.size()
- + delimiter_size * src_string_offsets.size() /// Separators after each string...
- + src_array_offsets.size() /// Zero byte after each joined string
- - src_string_offsets.size()); /// The former zero byte after each string of the array
-
- /// There will be as many strings as there were arrays.
- dst_string_offsets.resize(src_array_offsets.size());
-
- ColumnArray::Offset current_src_array_offset = 0;
-
- ColumnString::Offset current_dst_string_offset = 0;
-
- /// Loop through the array of strings.
- for (size_t i = 0; i < size; ++i)
- {
- bool first_non_null = true;
- /// Loop through the rows within the array. /// NOTE You can do everything in one copy, if the separator has a size of 1.
- for (auto next_src_array_offset = src_array_offsets[i]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset)
- {
- if (null_map && null_map[current_src_array_offset]) [[unlikely]]
- continue;
-
- if (!first_non_null)
- {
- memcpy(&dst_chars[current_dst_string_offset], delimiter, delimiter_size);
- current_dst_string_offset += delimiter_size;
- }
- first_non_null = false;
-
- const auto current_src_string_offset = current_src_array_offset ? src_string_offsets[current_src_array_offset - 1] : 0;
- size_t bytes_to_copy = src_string_offsets[current_src_array_offset] - current_src_string_offset - 1;
-
- memcpySmallAllowReadWriteOverflow15(
- &dst_chars[current_dst_string_offset], &src_chars[current_src_string_offset], bytes_to_copy);
-
- current_dst_string_offset += bytes_to_copy;
- }
-
- dst_chars[current_dst_string_offset] = 0;
- ++current_dst_string_offset;
-
- dst_string_offsets[i] = current_dst_string_offset;
- }
-
- dst_chars.resize(dst_string_offsets.back());
- }
-
- static void executeInternal(
- const ColumnString & col_string,
- const ColumnArray & col_arr,
- const String & delimiter,
- ColumnString & col_res,
- const char8_t * null_map = nullptr)
- {
- executeInternal(
- col_string.getChars(),
- col_string.getOffsets(),
- col_arr.getOffsets(),
- delimiter.data(),
- delimiter.size(),
- col_res.getChars(),
- col_res.getOffsets(),
- null_map);
- }
-
- static ColumnPtr serializeNestedColumn(const ColumnArray & col_arr, const DataTypePtr & nested_type)
- {
- if (isString(nested_type))
- {
- return col_arr.getDataPtr();
- }
- else if (const ColumnNullable * col_nullable = checkAndGetColumn(col_arr.getData());
- col_nullable && isString(col_nullable->getNestedColumn().getDataType()))
- {
- return col_nullable->getNestedColumnPtr();
- }
- else
- {
- ColumnsWithTypeAndName cols;
- cols.emplace_back(col_arr.getDataPtr(), nested_type, "tmp");
- return ConvertImplGenericToString::execute(cols, std::make_shared(), col_arr.size());
- }
- }
-
-public:
- static constexpr auto name = "arrayStringConcat";
- static FunctionPtr create(ContextPtr) { return std::make_shared(); }
-
- String getName() const override
- {
- return name;
- }
-
- bool isVariadic() const override { return true; }
- bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
- size_t getNumberOfArguments() const override { return 0; }
-
- DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override;
-
- ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & result_type, size_t /*input_rows_count*/) const override
- {
- String delimiter;
- if (arguments.size() == 2)
- {
- const ColumnConst * col_delim = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
- if (!col_delim)
- throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be constant string.", getName());
-
- delimiter = col_delim->getValue();
- }
-
- const auto & nested_type = assert_cast(*arguments[0].type).getNestedType();
- if (const ColumnConst * col_const_arr = checkAndGetColumnConst(arguments[0].column.get());
- col_const_arr && isString(nested_type))
- {
- Array src_arr = col_const_arr->getValue();
- String dst_str;
- bool first_non_null = true;
- for (size_t i = 0, size = src_arr.size(); i < size; ++i)
- {
- if (src_arr[i].isNull())
- continue;
- if (!first_non_null)
- dst_str += delimiter;
- first_non_null = false;
- dst_str += src_arr[i].get();
- }
-
- return result_type->createColumnConst(col_const_arr->size(), dst_str);
- }
-
- ColumnPtr src_column = arguments[0].column->convertToFullColumnIfConst();
- const ColumnArray & col_arr = assert_cast(*src_column.get());
-
- ColumnPtr str_subcolumn = serializeNestedColumn(col_arr, nested_type);
- const ColumnString & col_string = assert_cast(*str_subcolumn.get());
-
- auto col_res = ColumnString::create();
- if (const ColumnNullable * col_nullable = checkAndGetColumn(col_arr.getData()))
- executeInternal(col_string, col_arr, delimiter, *col_res, col_nullable->getNullMapData().data());
- else
- executeInternal(col_string, col_arr, delimiter, *col_res);
- return col_res;
- }
-};
-
-
-using FunctionSplitByAlpha = FunctionTokens;
-using FunctionSplitByNonAlpha = FunctionTokens;
-using FunctionSplitByWhitespace = FunctionTokens;
-using FunctionSplitByChar = FunctionTokens;
-using FunctionSplitByString = FunctionTokens;
-using FunctionSplitByRegexp = FunctionTokens;
-using FunctionExtractAll = FunctionTokens;
-
-}
diff --git a/src/Functions/URL/URLHierarchy.cpp b/src/Functions/URL/URLHierarchy.cpp
index 96b64d3182b..25c6c9ef40b 100644
--- a/src/Functions/URL/URLHierarchy.cpp
+++ b/src/Functions/URL/URLHierarchy.cpp
@@ -1,9 +1,15 @@
#include
-#include
+#include
+
namespace DB
{
+namespace
+{
+
+using Pos = const char *;
+
class URLPathHierarchyImpl
{
private:
@@ -14,7 +20,6 @@ private:
public:
static constexpr auto name = "URLPathHierarchy";
- static String getName() { return name; }
static bool isVariadic() { return false; }
static size_t getNumberOfArguments() { return 1; }
@@ -95,9 +100,10 @@ public:
};
-struct NameURLPathHierarchy { static constexpr auto name = "URLPathHierarchy"; };
using FunctionURLPathHierarchy = FunctionTokens;
+}
+
REGISTER_FUNCTION(URLPathHierarchy)
{
factory.registerFunction();
diff --git a/src/Functions/URL/URLPathHierarchy.cpp b/src/Functions/URL/URLPathHierarchy.cpp
index 7fd6601d780..9a60d4cf989 100644
--- a/src/Functions/URL/URLPathHierarchy.cpp
+++ b/src/Functions/URL/URLPathHierarchy.cpp
@@ -1,9 +1,14 @@
#include
-#include
+#include
namespace DB
{
+namespace
+{
+
+using Pos = const char *;
+
class URLHierarchyImpl
{
private:
@@ -13,7 +18,6 @@ private:
public:
static constexpr auto name = "URLHierarchy";
- static String getName() { return name; }
static bool isVariadic() { return false; }
static size_t getNumberOfArguments() { return 1; }
@@ -97,9 +101,10 @@ public:
};
-struct NameURLHierarchy { static constexpr auto name = "URLHierarchy"; };
using FunctionURLHierarchy = FunctionTokens;
+}
+
REGISTER_FUNCTION(URLHierarchy)
{
factory.registerFunction();
diff --git a/src/Functions/URL/extractURLParameterNames.cpp b/src/Functions/URL/extractURLParameterNames.cpp
index b792d9140d6..08da148b43e 100644
--- a/src/Functions/URL/extractURLParameterNames.cpp
+++ b/src/Functions/URL/extractURLParameterNames.cpp
@@ -1,9 +1,14 @@
#include
-#include
+#include
namespace DB
{
+namespace
+{
+
+using Pos = const char *;
+
class ExtractURLParameterNamesImpl
{
private:
@@ -13,7 +18,6 @@ private:
public:
static constexpr auto name = "extractURLParameterNames";
- static String getName() { return name; }
static bool isVariadic() { return false; }
static size_t getNumberOfArguments() { return 1; }
@@ -80,9 +84,10 @@ public:
}
};
-struct NameExtractURLParameterNames { static constexpr auto name = "extractURLParameterNames"; };
using FunctionExtractURLParameterNames = FunctionTokens;
+}
+
REGISTER_FUNCTION(ExtractURLParameterNames)
{
factory.registerFunction();
diff --git a/src/Functions/URL/extractURLParameters.cpp b/src/Functions/URL/extractURLParameters.cpp
index e1243d8fbcd..939622dd9d1 100644
--- a/src/Functions/URL/extractURLParameters.cpp
+++ b/src/Functions/URL/extractURLParameters.cpp
@@ -1,9 +1,15 @@
#include
-#include
+#include
+
namespace DB
{
+namespace
+{
+
+using Pos = const char *;
+
class ExtractURLParametersImpl
{
private:
@@ -13,7 +19,6 @@ private:
public:
static constexpr auto name = "extractURLParameters";
- static String getName() { return name; }
static bool isVariadic() { return false; }
static size_t getNumberOfArguments() { return 1; }
@@ -88,9 +93,10 @@ public:
}
};
-struct NameExtractURLParameters { static constexpr auto name = "extractURLParameters"; };
using FunctionExtractURLParameters = FunctionTokens;
+}
+
REGISTER_FUNCTION(ExtractURLParameters)
{
factory.registerFunction();
diff --git a/src/Functions/alphaTokens.cpp b/src/Functions/alphaTokens.cpp
new file mode 100644
index 00000000000..35cacdbdbb8
--- /dev/null
+++ b/src/Functions/alphaTokens.cpp
@@ -0,0 +1,104 @@
+
+#include
+#include
+#include
+
+
+namespace DB
+{
+
+/** Functions that split strings into an array of strings or vice versa.
+ *
+ * alphaTokens(s[, max_substrings]) - selects from the string all substrings matching `[a-zA-Z]+`.
+ */
+namespace
+{
+
+using Pos = const char *;
+
+class SplitByAlphaImpl  /// Tokenizer state for alphaTokens: returns consecutive runs of ASCII letters from one string at a time.
+{
+private:
+ Pos pos;  /// Current scan position inside the string being tokenized.
+ Pos end;  /// End of the current string; scanning stops when pos reaches it.
+ std::optional max_splits;  /// Limit obtained from extractMaxSplits(); when empty, get() applies no limit.
+ size_t splits;  /// Number of tokens returned so far for the current string; reset by set().
+ bool max_substrings_includes_remaining_string;  /// Selects which of the two limit behaviours get() applies.
+
+public:
+ static constexpr auto name = "alphaTokens";
+
+ static bool isVariadic() { return true; }
+
+ static size_t getNumberOfArguments() { return 0; }
+
+ static void checkArguments(const IFunction & func, const ColumnsWithTypeAndName & arguments)
+ {
+ checkArgumentsWithOptionalMaxSubstrings(func, arguments);
+ }
+
+ static constexpr auto strings_argument_position = 0uz;  /// The string to split is argument 0 - presumably read by the FunctionTokens wrapper; confirm there.
+
+ void init(const ColumnsWithTypeAndName & arguments, bool max_substrings_includes_remaining_string_)
+ {
+ max_substrings_includes_remaining_string = max_substrings_includes_remaining_string_;
+ max_splits = extractMaxSplits(arguments, 1);  /// Argument 1 is the optional max_substrings.
+ }
+
+ /// Called for each next string: remembers its bounds and restarts the token counter.
+ void set(Pos pos_, Pos end_)
+ {
+ pos = pos_;
+ end = end_;
+ splits = 0;
+ }
+
+ /// Stores the next token as [token_begin, token_end) and returns true, or returns false if there is none.
+ bool get(Pos & token_begin, Pos & token_end)
+ {
+ /// Skip everything that is not an ASCII letter.
+ while (pos < end && !isAlphaASCII(*pos))
+ ++pos;
+
+ if (pos == end)
+ return false;
+
+ token_begin = pos;
+
+ if (max_splits)
+ {
+ if (max_substrings_includes_remaining_string)
+ {
+ if (splits == *max_splits - 1)  /// Last allowed token: it takes the whole remainder of the string.
+ {
+ token_end = end;
+ pos = end;  /// The next call sees pos == end and returns false.
+ return true;
+ }
+ }
+ else
+ if (splits == *max_splits)  /// Limit reached: stop and discard the remainder.
+ return false;
+ }
+
+ while (pos < end && isAlphaASCII(*pos))
+ ++pos;
+
+ token_end = pos;
+ ++splits;
+
+ return true;
+ }
+};
+
+using FunctionSplitByAlpha = FunctionTokens;  /// Function type registered below (presumably FunctionTokens over SplitByAlphaImpl - confirm).
+
+}
+
+REGISTER_FUNCTION(SplitByAlpha)  /// Registers the function with the factory and adds a second name for it.
+{
+ factory.registerFunction();
+ factory.registerAlias("splitByAlpha", FunctionSplitByAlpha::name);  /// "splitByAlpha" is an alias of FunctionSplitByAlpha::name (presumably "alphaTokens" - confirm).
+}
+
+}
diff --git a/src/Functions/appendTrailingCharIfAbsent.cpp b/src/Functions/appendTrailingCharIfAbsent.cpp
index 62c0bbd4598..7ff35e599be 100644
--- a/src/Functions/appendTrailingCharIfAbsent.cpp
+++ b/src/Functions/appendTrailingCharIfAbsent.cpp
@@ -4,7 +4,6 @@
#include
#include
#include
-#include
namespace DB
@@ -46,10 +45,10 @@ private:
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
- throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[0]->getName(), getName());
+ throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of the first argument of function {}", arguments[0]->getName(), getName());
if (!isString(arguments[1]))
- throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument of function {}", arguments[1]->getName(), getName());
+ throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of the second argument of function {}", arguments[1]->getName(), getName());
return std::make_shared();
}
diff --git a/src/Functions/arrayStringConcat.cpp b/src/Functions/arrayStringConcat.cpp
new file mode 100644
index 00000000000..0194cc4871a
--- /dev/null
+++ b/src/Functions/arrayStringConcat.cpp
@@ -0,0 +1,202 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+/** arrayStringConcat(arr)
+ * arrayStringConcat(arr, delimiter)
+ * - join an array of strings into one string via a separator.
+ */
+namespace DB
+{
+
+namespace ErrorCodes
+{
+ extern const int ILLEGAL_COLUMN;
+}
+
+namespace
+{
+
+/// Joins an array of type serializable to string into one string via a separator.
+class FunctionArrayStringConcat : public IFunction
+{
+private:
+ static void executeInternal(  /// Joins the non-skipped strings of each array into one result string, inserting `delimiter` between them.
+ const ColumnString::Chars & src_chars,
+ const ColumnString::Offsets & src_string_offsets,
+ const ColumnArray::Offsets & src_array_offsets,
+ const char * delimiter,
+ const size_t delimiter_size,
+ ColumnString::Chars & dst_chars,
+ ColumnString::Offsets & dst_string_offsets,
+ const char8_t * null_map)  /// May be nullptr; otherwise a non-zero entry marks an element that is skipped.
+ {
+ size_t size = src_array_offsets.size();  /// Number of arrays, i.e. of result strings.
+
+ if (!size)
+ return;  /// Nothing to join; the outputs are left untouched.
+
+ /// Upper bound on the result size, with a small margin - as if a separator also followed the last string of each array.
+ dst_chars.resize(
+ src_chars.size()
+ + delimiter_size * src_string_offsets.size() /// Separators after each string...
+ + src_array_offsets.size() /// Zero byte after each joined string
+ - src_string_offsets.size()); /// The former zero byte after each string of the array
+
+ /// There will be as many strings as there were arrays.
+ dst_string_offsets.resize(src_array_offsets.size());
+
+ ColumnArray::Offset current_src_array_offset = 0;  /// Index of the next element to process; it keeps advancing across arrays and is never reset.
+
+ ColumnString::Offset current_dst_string_offset = 0;  /// Write position in dst_chars.
+
+ /// Loop through the arrays.
+ for (size_t i = 0; i < size; ++i)
+ {
+ bool first_non_null = true;  /// No separator is written before the first element that is actually copied.
+ /// Loop through the elements of the current array. /// NOTE You can do everything in one copy, if the separator has a size of 1.
+ for (auto next_src_array_offset = src_array_offsets[i]; current_src_array_offset < next_src_array_offset; ++current_src_array_offset)
+ {
+ if (null_map && null_map[current_src_array_offset]) [[unlikely]]
+ continue;
+
+ if (!first_non_null)
+ {
+ memcpy(&dst_chars[current_dst_string_offset], delimiter, delimiter_size);
+ current_dst_string_offset += delimiter_size;
+ }
+ first_non_null = false;
+
+ const auto current_src_string_offset = current_src_array_offset ? src_string_offsets[current_src_array_offset - 1] : 0;  /// The first string starts at 0; every other one starts where the previous one ended.
+ size_t bytes_to_copy = src_string_offsets[current_src_array_offset] - current_src_string_offset - 1;  /// Length without the trailing zero byte.
+
+ memcpySmallAllowReadWriteOverflow15(  /// NOTE(review): judging by its name, this helper may touch up to 15 bytes past the range - confirm both buffers are padded.
+ &dst_chars[current_dst_string_offset], &src_chars[current_src_string_offset], bytes_to_copy);
+
+ current_dst_string_offset += bytes_to_copy;
+ }
+
+ dst_chars[current_dst_string_offset] = 0;  /// Terminate the joined string.
+ ++current_dst_string_offset;
+
+ dst_string_offsets[i] = current_dst_string_offset;  /// Offset just past the terminating zero of result i.
+ }
+
+ dst_chars.resize(dst_string_offsets.back());  /// Drop the unused part of the initial over-allocation.
+ }
+
+ static void executeInternal(  /// Column-level overload: unpacks the columns and the delimiter into raw buffers and forwards them to the overload above.
+ const ColumnString & col_string,
+ const ColumnArray & col_arr,
+ const String & delimiter,
+ ColumnString & col_res,
+ const char8_t * null_map = nullptr)  /// Defaults to "no element is skipped".
+ {
+ executeInternal(
+ col_string.getChars(),
+ col_string.getOffsets(),
+ col_arr.getOffsets(),
+ delimiter.data(),
+ delimiter.size(),
+ col_res.getChars(),
+ col_res.getOffsets(),
+ null_map);
+ }
+
+ static ColumnPtr serializeNestedColumn(const ColumnArray & col_arr, const DataTypePtr & nested_type)  /// Returns the array's element column with any Nullable wrapper removed, converted by castColumn.
+ {
+ DataTypePtr type = nested_type;
+ ColumnPtr column = col_arr.getDataPtr();
+
+ if (type->isNullable())
+ {
+ type = removeNullable(type);  /// Work on the nested type and column from here on.
+ column = assert_cast(*column).getNestedColumnPtr();
+ }
+
+ return castColumn({column, type, "tmp"}, std::make_shared());  /// NOTE(review): the target type is presumably String, given the function's purpose - confirm.
+ }
+
+public:
+ static constexpr auto name = "arrayStringConcat";
+ static FunctionPtr create(ContextPtr) { return std::make_shared(); }
+
+ String getName() const override
+ {
+ return name;
+ }
+
+ bool isVariadic() const override { return true; }  /// One or two arguments are accepted; they are validated in getReturnTypeImpl().
+ bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
+ size_t getNumberOfArguments() const override { return 0; }
+
+ bool useDefaultImplementationForConstants() const override { return true; }
+ ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }  /// Argument 1 is the separator, which must be constant.
+
+ DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override  /// Validates the argument list and returns the result type.
+ {
+ FunctionArgumentDescriptors mandatory_args
+ {
+ {"arr", &isArray, nullptr, "Array"},  /// First argument: must be an Array.
+ };
+
+ FunctionArgumentDescriptors optional_args
+ {
+ {"separator", &isString, isColumnConst, "const String"},  /// Optional second argument: a constant String.
+ };
+
+ validateFunctionArgumentTypes(*this, arguments, mandatory_args, optional_args);  /// Presumably throws on a mismatch - confirm in its definition.
+
+ return std::make_shared();
+ }
+
+ ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr & /*result_type*/, size_t /*input_rows_count*/) const override
+ {
+ String delimiter;
+ if (arguments.size() == 2)
+ {
+ const ColumnConst * col_delim = checkAndGetColumnConstStringOrFixedString(arguments[1].column.get());
+ if (!col_delim)
+ throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument for function {} must be constant string.", getName());
+
+ delimiter = col_delim->getValue();
+ }
+
+ const auto & nested_type = assert_cast(*arguments[0].type).getNestedType();
+ const ColumnArray & col_arr = assert_cast