From bd0ce5fc0bcaaff9586e9e4549210882cc26aa64 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Tue, 27 Jun 2023 16:34:04 +0800 Subject: [PATCH 01/18] wip --- src/Functions/substringIndex.cpp | 245 +++++++++++++++++++++++++++++++ 1 file changed, 245 insertions(+) create mode 100644 src/Functions/substringIndex.cpp diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp new file mode 100644 index 00000000000..67699304320 --- /dev/null +++ b/src/Functions/substringIndex.cpp @@ -0,0 +1,245 @@ +#include +#include +#include +#include +#include +#include +#include "base/find_symbols.h" + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int ILLEGAL_COLUMN; + extern const int ILLEGAL_TYPE_OF_ARGUMENT; + extern const int ZERO_ARRAY_OR_TUPLE_INDEX; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int BAD_ARGUMENTS; +} + +namespace +{ + +template +class FunctionSubstringIndex : public IFunction +{ +public: + static constexpr auto name = is_utf8 ? "substringIndexUTF8" : "substringIndex"; + + + static FunctionPtr create(ContextPtr) + { + return std::make_shared(); + } + + String getName() const override + { + return name; + } + + size_t getNumberOfArguments() const override { return 3; } + + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + + bool useDefaultImplementationForConstants() const override { return true; } + + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override + { + if (!isString(arguments[0])) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of first argument of function {}", + arguments[0]->getName(), + getName()); + + if (!isString(arguments[1])) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of second argument of function {}", + arguments[1]->getName(), + getName()); + + if (!isNativeNumber(arguments[2])) + throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of third argument of function {}", + arguments[2]->getName(), getName()); + + return std::make_shared(); + } + + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + { + ColumnPtr column_string = arguments[0].column; + ColumnPtr column_delim = arguments[1].column; + ColumnPtr column_index = arguments[2].column; + + const ColumnConst * column_delim_const = checkAndGetColumnConst(column_delim.get()); + if (!column_delim_const) + throw Exception(ErrorCodes::ILLEGAL_COLUMN , "Second argument to {} must be a constant String", getName()); + + String delim = column_delim_const->getValue(); + if constexpr (!is_utf8) + { + if (delim.size() != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single character", getName()); + } + else + { + // TODO + } + + auto column_res = ColumnString::create(); + ColumnString::Chars & vec_res = column_res->getChars(); + ColumnString::Offsets & offsets_res = column_res->getOffsets(); + + const ColumnConst * column_string_const = checkAndGetColumnConst(column_string.get()); + if (column_string_const) + { + String str = column_string_const->getValue(); + constantVector(str, delim[0], column_index.get(), vec_res, offsets_res); + } + else + { + const auto * col_str = checkAndGetColumn(column_string.get()); + if (!col_str) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument to {} must be a String", getName()); + + bool is_index_const = isColumnConst(*column_index); + if (is_index_const) + { + Int64 index = column_index->getInt(0); + vectorConstant(col_str->getChars(), col_str->getOffsets(), delim[0], index, vec_res, offsets_res); + } + else + vectorVector(col_str->getChars(), col_str->getOffsets(), delim[0], column_index.get(), vec_res, offsets_res); + } + } + +protected: + static void vectorVector( + const ColumnString::Chars & str_data, + const ColumnString::Offsets & str_offsets, + char delim, + const IColumn * index_column, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + size_t rows = str_offsets.size(); + res_data.reserve(str_data.size() / 2); + res_offsets.reserve(rows); + + for (size_t i=0; igetInt(i); + StringRef res_ref = substringIndex(str_ref, index); + appendToResultColumn(res_ref, res_data, res_offsets); + } + } + + static void vectorConstant( + const ColumnString::Chars & str_data, + const ColumnString::Offsets & str_offsets, + char delim, + Int64 index, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + size_t rows = str_offsets.size(); + res_data.reserve(str_data.size() / 2); + res_offsets.reserve(rows); + + for (size_t i = 0; i(str_ref, index); + appendToResultColumn(res_ref, res_data, res_offsets); + } + } + + static void constantVector( + const String & str, + char delim, + const IColumn * index_column, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + size_t rows = index_column->size(); + res_data.reserve(str.size() * rows / 2); + res_offsets.reserve(rows); + + StringRef str_ref{str.data(), str.size()}; + for (size_t i=0; igetInt(i); + StringRef res_ref = substringIndex(str_ref, index); + appendToResultColumn(res_ref, res_data, res_offsets); + } + } + + static void appendToResultColumn( + const StringRef & res_ref, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) + { + size_t res_offset = res_data.size(); + res_data.resize(res_offset + res_ref.size + 1); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], res_ref.data, res_ref.size); + res_offset += res_ref.size; + res_data[res_offset] = 0; + ++res_offset; + + res_offsets.emplace_back(res_offset); + } + + template + static StringRef substringIndex( + const StringRef & str, + Int64 index) + { + if (index == 0) + return {str.data, 0}; + + if (index > 0) + { + const auto * end = str.data + str.size; + const auto * pos = str.data; + Int64 i = 0; + while (i < index) + { + pos = find_first_symbols(pos, end); + + if (pos != end) + { + ++pos; + ++i; + } + else + return str; + } + return {str.data, static_cast(pos - str.data)}; + } + else + { + const auto * begin = str.data; + const auto * pos = str.data + str.size; + Int64 i = 0; + while (i < index) + { + const auto * next_pos = detail::find_last_symbols_sse2(begin, pos); + + if (next_pos != pos) + { + pos = next_pos; + ++i; + } + else + return str; + } + + return {pos + 1, static_cast(str.data + str.size - pos - 1)}; + } + } +}; +} + +} + From 0de5fcfbee1d4add8c0a350392163e5f46f23f97 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Tue, 27 Jun 2023 18:13:25 +0800 Subject: [PATCH 02/18] finish dev --- src/Functions/substringIndex.cpp | 127 +++++++++++++++++++++++++------ 1 file changed, 102 insertions(+), 25 deletions(-) diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 67699304320..0a5dfd00656 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -1,10 +1,14 @@ #include +#include #include +#include #include #include +#include #include -#include -#include "base/find_symbols.h" +#include +#include +#include namespace DB { @@ -67,7 +71,7 @@ public: return std::make_shared(); } - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { ColumnPtr column_string = arguments[0].column; ColumnPtr column_delim = arguments[1].column; @@ -85,7 +89,8 @@ public: } else { - // TODO + if (UTF8::countCodePoints(reinterpret_cast(delim.data()), delim.size()) != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single UTF-8 character", getName()); } auto column_res = ColumnString::create(); @@ -96,7 +101,7 @@ public: if (column_string_const) { String str = column_string_const->getValue(); - constantVector(str, delim[0], column_index.get(), vec_res, offsets_res); + constantVector(str, delim, column_index.get(), vec_res, offsets_res); } else { @@ -108,10 +113,10 @@ public: if (is_index_const) { Int64 index = column_index->getInt(0); - vectorConstant(col_str->getChars(), col_str->getOffsets(), delim[0], index, vec_res, offsets_res); + vectorConstant(col_str->getChars(), col_str->getOffsets(), delim, index, vec_res, offsets_res); } else - vectorVector(col_str->getChars(), col_str->getOffsets(), delim[0], column_index.get(), vec_res, offsets_res); + vectorVector(col_str->getChars(), col_str->getOffsets(), delim, column_index.get(), vec_res, offsets_res); } } @@ -119,7 +124,7 @@ protected: static void vectorVector( const ColumnString::Chars & str_data, const ColumnString::Offsets & str_offsets, - char delim, + const String & delim, const IColumn * index_column, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) @@ -128,11 +133,15 @@ protected: res_data.reserve(str_data.size() / 2); res_offsets.reserve(rows); - for (size_t i=0; i searcher + = !is_utf8 ? nullptr : std::make_unique(delim); + + for (size_t i = 0; i < rows; ++i) { StringRef str_ref{&str_data[str_offsets[i]], str_offsets[i] - str_offsets[i - 1] - 1}; Int64 index = index_column->getInt(i); - StringRef res_ref = substringIndex(str_ref, index); + StringRef res_ref + = !is_utf8 ? substringIndex(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); appendToResultColumn(res_ref, res_data, res_offsets); } } @@ -140,7 +149,7 @@ protected: static void vectorConstant( const ColumnString::Chars & str_data, const ColumnString::Offsets & str_offsets, - char delim, + const String & delim, Int64 index, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) @@ -149,17 +158,21 @@ protected: res_data.reserve(str_data.size() / 2); res_offsets.reserve(rows); + std::unique_ptr searcher + = !is_utf8 ? nullptr : std::make_unique(delim); + for (size_t i = 0; i(str_ref, index); + StringRef res_ref + = !is_utf8 ? substringIndex(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); appendToResultColumn(res_ref, res_data, res_offsets); } } static void constantVector( const String & str, - char delim, + const String & delim, const IColumn * index_column, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) @@ -168,11 +181,15 @@ protected: res_data.reserve(str.size() * rows / 2); res_offsets.reserve(rows); + std::unique_ptr searcher + = !is_utf8 ? nullptr : std::make_unique(delim); + StringRef str_ref{str.data(), str.size()}; for (size_t i=0; igetInt(i); - StringRef res_ref = substringIndex(str_ref, index); + StringRef res_ref + = !is_utf8 ? substringIndex(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); appendToResultColumn(res_ref, res_data, res_offsets); } } @@ -190,18 +207,68 @@ protected: res_offsets.emplace_back(res_offset); } + static StringRef substringIndexUTF8( + const PositionCaseSensitiveUTF8::SearcherInBigHaystack * searcher, const StringRef & str_ref, const String & delim, Int64 index) + { + if (index == 0) + return {str_ref.data, 0}; + + const auto * begin = reinterpret_cast(str_ref.data); + const auto * end = reinterpret_cast(str_ref.data + str_ref.size); + const auto * pos = begin; + if (index > 0) + { + Int64 i = 0; + while (i < index) + { + pos = searcher->search(pos, end - pos); + + if (pos != end) + { + pos += delim.size(); + ++i; + } + else + return str_ref; + } + return {begin, static_cast(pos - begin - delim.size())}; + } + else + { + Int64 total = 0; + while (pos < end && end != (pos = searcher->search(pos, end - pos))) + { + pos += delim.size(); + ++total; + } + + if (total + index < 0) + return str_ref; + + Int64 index_from_left = total + 1 + index; + pos = begin; + Int64 i = 0; + while (pos < end && end != (pos = searcher->search(pos, end - pos)) && i < index_from_left) + { + pos += delim.size(); + ++i; + } + return {pos, static_cast(end - pos)}; + } + } + template static StringRef substringIndex( - const StringRef & str, + const StringRef & str_ref, Int64 index) { if (index == 0) - return {str.data, 0}; + return {str_ref.data, 0}; if (index > 0) { - const auto * end = str.data + str.size; - const auto * pos = str.data; + const auto * end = str_ref.data + str_ref.size; + const auto * pos = str_ref.data; Int64 i = 0; while (i < index) { @@ -213,18 +280,18 @@ protected: ++i; } else - return str; + return str_ref; } - return {str.data, static_cast(pos - str.data)}; + return {str_ref.data, static_cast(pos - str_ref.data - 1)}; } else { - const auto * begin = str.data; - const auto * pos = str.data + str.size; + const auto * begin = str_ref.data; + const auto * pos = str_ref.data + str_ref.size; Int64 i = 0; while (i < index) { - const auto * next_pos = detail::find_last_symbols_sse2(begin, pos); + const auto * next_pos = ::detail::find_last_symbols_sse2(begin, pos); if (next_pos != pos) { @@ -232,14 +299,24 @@ protected: ++i; } else - return str; + return str_ref; } - return {pos + 1, static_cast(str.data + str.size - pos - 1)}; + return {pos + 1, static_cast(str_ref.data + str_ref.size - pos - 1)}; } } }; } + +REGISTER_FUNCTION(SubstringIndex) +{ + factory.registerFunction>(); /// substringIndex + factory.registerFunction>(); /// substringIndexUTF8 + + factory.registerAlias("SUBSTRING_INDEX", "substringIndex", FunctionFactory::CaseInsensitive); +} + + } From ae7a586aea59deb84a7355021b06eb3b35d876f7 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 28 Jun 2023 10:45:52 +0800 Subject: [PATCH 03/18] fix bugs and add uts --- src/Functions/substringIndex.cpp | 497 +++++++++--------- .../02798_substring_index.reference | 155 ++++++ .../0_stateless/02798_substring_index.sql | 93 ++++ 3 files changed, 496 insertions(+), 249 deletions(-) create mode 100644 tests/queries/0_stateless/02798_substring_index.reference create mode 100644 tests/queries/0_stateless/02798_substring_index.sql diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 0a5dfd00656..1fca3bbed14 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -25,287 +25,287 @@ namespace ErrorCodes namespace { -template -class FunctionSubstringIndex : public IFunction -{ -public: - static constexpr auto name = is_utf8 ? "substringIndexUTF8" : "substringIndex"; - - - static FunctionPtr create(ContextPtr) + template + class FunctionSubstringIndex : public IFunction { - return std::make_shared(); - } + public: + static constexpr auto name = is_utf8 ? "substringIndexUTF8" : "substringIndex"; - String getName() const override - { - return name; - } - size_t getNumberOfArguments() const override { return 3; } + static FunctionPtr create(ContextPtr) { return std::make_shared(); } - bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } + String getName() const override { return name; } - bool useDefaultImplementationForConstants() const override { return true; } + size_t getNumberOfArguments() const override { return 3; } - DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override - { - if (!isString(arguments[0])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of first argument of function {}", - arguments[0]->getName(), - getName()); + bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; } - if (!isString(arguments[1])) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of second argument of function {}", - arguments[1]->getName(), - getName()); + bool useDefaultImplementationForConstants() const override { return true; } + ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } - if (!isNativeNumber(arguments[2])) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of third argument of function {}", - arguments[2]->getName(), getName()); - - return std::make_shared(); - } - - ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override - { - ColumnPtr column_string = arguments[0].column; - ColumnPtr column_delim = arguments[1].column; - ColumnPtr column_index = arguments[2].column; - - const ColumnConst * column_delim_const = checkAndGetColumnConst(column_delim.get()); - if (!column_delim_const) - throw Exception(ErrorCodes::ILLEGAL_COLUMN , "Second argument to {} must be a constant String", getName()); - - String delim = column_delim_const->getValue(); - if constexpr (!is_utf8) + DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override { - if (delim.size() != 1) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single character", getName()); - } - else - { - if (UTF8::countCodePoints(reinterpret_cast(delim.data()), delim.size()) != 1) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single UTF-8 character", getName()); + if (!isString(arguments[0])) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of first argument of function {}", + arguments[0]->getName(), + getName()); + + if (!isString(arguments[1])) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of second argument of function {}", + arguments[1]->getName(), + getName()); + + if (!isNativeNumber(arguments[2])) + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, + "Illegal type {} of third argument of function {}", + arguments[2]->getName(), + getName()); + + return std::make_shared(); } - auto column_res = ColumnString::create(); - ColumnString::Chars & vec_res = column_res->getChars(); - ColumnString::Offsets & offsets_res = column_res->getOffsets(); - - const ColumnConst * column_string_const = checkAndGetColumnConst(column_string.get()); - if (column_string_const) + ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t /*input_rows_count*/) const override { - String str = column_string_const->getValue(); - constantVector(str, delim, column_index.get(), vec_res, offsets_res); - } - else - { - const auto * col_str = checkAndGetColumn(column_string.get()); - if (!col_str) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument to {} must be a String", getName()); + ColumnPtr column_string = arguments[0].column; + ColumnPtr column_delim = arguments[1].column; + ColumnPtr column_index = arguments[2].column; - bool is_index_const = isColumnConst(*column_index); - if (is_index_const) + const ColumnConst * column_delim_const = checkAndGetColumnConst(column_delim.get()); + if (!column_delim_const) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument to {} must be a constant String", getName()); + + String delim = column_delim_const->getValue(); + if constexpr (!is_utf8) { - Int64 index = column_index->getInt(0); - vectorConstant(col_str->getChars(), col_str->getOffsets(), delim, index, vec_res, offsets_res); + if (delim.size() != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single character", getName()); } else - vectorVector(col_str->getChars(), col_str->getOffsets(), delim, column_index.get(), vec_res, offsets_res); - } - } - -protected: - static void vectorVector( - const ColumnString::Chars & str_data, - const ColumnString::Offsets & str_offsets, - const String & delim, - const IColumn * index_column, - ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets) - { - size_t rows = str_offsets.size(); - res_data.reserve(str_data.size() / 2); - res_offsets.reserve(rows); - - std::unique_ptr searcher - = !is_utf8 ? nullptr : std::make_unique(delim); - - for (size_t i = 0; i < rows; ++i) - { - StringRef str_ref{&str_data[str_offsets[i]], str_offsets[i] - str_offsets[i - 1] - 1}; - Int64 index = index_column->getInt(i); - StringRef res_ref - = !is_utf8 ? substringIndex(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); - appendToResultColumn(res_ref, res_data, res_offsets); - } - } - - static void vectorConstant( - const ColumnString::Chars & str_data, - const ColumnString::Offsets & str_offsets, - const String & delim, - Int64 index, - ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets) - { - size_t rows = str_offsets.size(); - res_data.reserve(str_data.size() / 2); - res_offsets.reserve(rows); - - std::unique_ptr searcher - = !is_utf8 ? nullptr : std::make_unique(delim); - - for (size_t i = 0; i(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); - appendToResultColumn(res_ref, res_data, res_offsets); - } - } - - static void constantVector( - const String & str, - const String & delim, - const IColumn * index_column, - ColumnString::Chars & res_data, - ColumnString::Offsets & res_offsets) - { - size_t rows = index_column->size(); - res_data.reserve(str.size() * rows / 2); - res_offsets.reserve(rows); - - std::unique_ptr searcher - = !is_utf8 ? nullptr : std::make_unique(delim); - - StringRef str_ref{str.data(), str.size()}; - for (size_t i=0; igetInt(i); - StringRef res_ref - = !is_utf8 ? substringIndex(str_ref, index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); - appendToResultColumn(res_ref, res_data, res_offsets); - } - } - - static void appendToResultColumn( - const StringRef & res_ref, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) - { - size_t res_offset = res_data.size(); - res_data.resize(res_offset + res_ref.size + 1); - memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], res_ref.data, res_ref.size); - res_offset += res_ref.size; - res_data[res_offset] = 0; - ++res_offset; - - res_offsets.emplace_back(res_offset); - } - - static StringRef substringIndexUTF8( - const PositionCaseSensitiveUTF8::SearcherInBigHaystack * searcher, const StringRef & str_ref, const String & delim, Int64 index) - { - if (index == 0) - return {str_ref.data, 0}; - - const auto * begin = reinterpret_cast(str_ref.data); - const auto * end = reinterpret_cast(str_ref.data + str_ref.size); - const auto * pos = begin; - if (index > 0) - { - Int64 i = 0; - while (i < index) { - pos = searcher->search(pos, end - pos); + if (UTF8::countCodePoints(reinterpret_cast(delim.data()), delim.size()) != 1) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Second argument to {} must be a single UTF-8 character", getName()); + } - if (pos != end) + auto column_res = ColumnString::create(); + ColumnString::Chars & vec_res = column_res->getChars(); + ColumnString::Offsets & offsets_res = column_res->getOffsets(); + + const ColumnConst * column_string_const = checkAndGetColumnConst(column_string.get()); + if (column_string_const) + { + String str = column_string_const->getValue(); + constantVector(str, delim, column_index.get(), vec_res, offsets_res); + } + else + { + const auto * col_str = checkAndGetColumn(column_string.get()); + if (!col_str) + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument to {} must be a String", getName()); + + bool is_index_const = isColumnConst(*column_index); + if (is_index_const) + { + Int64 index = column_index->getInt(0); + vectorConstant(col_str, delim, index, vec_res, offsets_res); + } + else + vectorVector(col_str, delim, column_index.get(), vec_res, offsets_res); + } + return column_res; + } + + protected: + static void vectorVector( + const ColumnString * str_column, + const String & delim, + const IColumn * index_column, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + size_t rows = str_column->size(); + res_data.reserve(str_column->getChars().size() / 2); + res_offsets.reserve(rows); + + std::unique_ptr searcher + = !is_utf8 ? nullptr : std::make_unique(delim.data(), delim.size()); + + for (size_t i = 0; i < rows; ++i) + { + StringRef str_ref = str_column->getDataAt(i); + Int64 index = index_column->getInt(i); + StringRef res_ref + = !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); + appendToResultColumn(res_ref, res_data, res_offsets); + } + } + + static void vectorConstant( + const ColumnString * str_column, + const String & delim, + Int64 index, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + size_t rows = str_column->size(); + res_data.reserve(str_column->getChars().size() / 2); + res_offsets.reserve(rows); + + std::unique_ptr searcher + = !is_utf8 ? nullptr : std::make_unique(delim.data(), delim.size()); + + for (size_t i = 0; i < rows; ++i) + { + StringRef str_ref = str_column->getDataAt(i); + StringRef res_ref + = !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); + std::cout << "result:" << res_ref.toString() << std::endl; + appendToResultColumn(res_ref, res_data, res_offsets); + } + } + + static void constantVector( + const String & str, + const String & delim, + const IColumn * index_column, + ColumnString::Chars & res_data, + ColumnString::Offsets & res_offsets) + { + size_t rows = index_column->size(); + res_data.reserve(str.size() * rows / 2); + res_offsets.reserve(rows); + + std::unique_ptr searcher + = !is_utf8 ? nullptr : std::make_unique(delim.data(), delim.size()); + + StringRef str_ref{str.data(), str.size()}; + for (size_t i = 0; i < rows; ++i) + { + Int64 index = index_column->getInt(i); + StringRef res_ref + = !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); + appendToResultColumn(res_ref, res_data, res_offsets); + } + } + + static void appendToResultColumn(const StringRef & res_ref, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) + { + size_t res_offset = res_data.size(); + res_data.resize(res_offset + res_ref.size + 1); + memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], res_ref.data, res_ref.size); + res_offset += res_ref.size; + res_data[res_offset] = 0; + ++res_offset; + + res_offsets.emplace_back(res_offset); + } + + static StringRef substringIndexUTF8( + const PositionCaseSensitiveUTF8::SearcherInBigHaystack * searcher, const StringRef & str_ref, const String & delim, Int64 index) + { + std::cout << "str:" << str_ref.toString() << ", delim" << delim << ",index:" << index << std::endl; + + if (index == 0) + return {str_ref.data, 0}; + + const auto * begin = reinterpret_cast(str_ref.data); + const auto * end = reinterpret_cast(str_ref.data + str_ref.size); + const auto * pos = begin; + if (index > 0) + { + Int64 i = 0; + while (i < index) + { + pos = searcher->search(pos, end - pos); + + if (pos != end) + { + pos += delim.size(); + ++i; + } + else + return str_ref; + } + return {begin, static_cast(pos - begin - delim.size())}; + } + else + { + Int64 total = 0; + while (pos < end && end != (pos = searcher->search(pos, end - pos))) + { + pos += delim.size(); + ++total; + } + + if (total + index < 0) + return str_ref; + + Int64 index_from_left = total + 1 + index; + std::cout << "total:" << total << ", index_from_left" << index_from_left << std::endl; + pos = begin; + Int64 i = 0; + while (i < index_from_left && pos < end && end != (pos = searcher->search(pos, end - pos))) { pos += delim.size(); ++i; + std::cout << "pos offset:" << pos - begin << ", total size:" << end - begin << std::endl; } - else - return str_ref; + std::cout << "pos offset:" << pos - begin << ", size:" << end - pos << std::endl; + StringRef res = {pos, static_cast(end - pos)}; + std::cout << "result:" << res.toString() << std::endl; + return res; } - return {begin, static_cast(pos - begin - delim.size())}; } - else + + static StringRef substringIndex(const StringRef & str_ref, char delim, Int64 index) { - Int64 total = 0; - while (pos < end && end != (pos = searcher->search(pos, end - pos))) + std::cout << "str:" << str_ref.toString() << ", delim" << delim << ",index:" << index << std::endl; + + if (index == 0) + return {str_ref.data, 0}; + + if (index > 0) { - pos += delim.size(); - ++total; - } - - if (total + index < 0) - return str_ref; - - Int64 index_from_left = total + 1 + index; - pos = begin; - Int64 i = 0; - while (pos < end && end != (pos = searcher->search(pos, end - pos)) && i < index_from_left) - { - pos += delim.size(); - ++i; - } - return {pos, static_cast(end - pos)}; - } - } - - template - static StringRef substringIndex( - const StringRef & str_ref, - Int64 index) - { - if (index == 0) - return {str_ref.data, 0}; - - if (index > 0) - { - const auto * end = str_ref.data + str_ref.size; - const auto * pos = str_ref.data; - Int64 i = 0; - while (i < index) - { - pos = find_first_symbols(pos, end); - - if (pos != end) + const auto * end = str_ref.data + str_ref.size; + const auto * pos = str_ref.data; + Int64 i = 0; + while (i < index) { - ++pos; - ++i; + pos = std::find(pos, end, delim); + if (pos != end) + { + ++pos; + ++i; + } + else + return str_ref; } - else - return str_ref; + return {str_ref.data, static_cast(pos - str_ref.data - 1)}; } - return {str_ref.data, static_cast(pos - str_ref.data - 1)}; - } - else - { - const auto * begin = str_ref.data; - const auto * pos = str_ref.data + str_ref.size; - Int64 i = 0; - while (i < index) + else { - const auto * next_pos = ::detail::find_last_symbols_sse2(begin, pos); - - if (next_pos != pos) + const auto * begin = str_ref.data; + const auto * pos = str_ref.data + str_ref.size; + Int64 i = 0; + while (i + index < 0) { - pos = next_pos; - ++i; - } - else - return str_ref; - } + --pos; + while (pos >= begin && *pos != delim) + --pos; - return {pos + 1, static_cast(str_ref.data + str_ref.size - pos - 1)}; + if (pos >= begin) + ++i; + else + return str_ref; + } + return {pos + 1, static_cast(str_ref.data + str_ref.size - pos - 1)}; + } } - } -}; + }; } @@ -319,4 +319,3 @@ REGISTER_FUNCTION(SubstringIndex) } - diff --git a/tests/queries/0_stateless/02798_substring_index.reference b/tests/queries/0_stateless/02798_substring_index.reference new file mode 100644 index 00000000000..a3084509c12 --- /dev/null +++ b/tests/queries/0_stateless/02798_substring_index.reference @@ -0,0 +1,155 @@ +-- { echoOn } +select substringIndex('www.clickhouse.com', '.', -4); +www.clickhouse.com +select substringIndex('www.clickhouse.com', '.', -3); +www.clickhouse.com +select substringIndex('www.clickhouse.com', '.', -2); +clickhouse.com +select substringIndex('www.clickhouse.com', '.', -1); +com +select substringIndex('www.clickhouse.com', '.', 0); + +select substringIndex('www.clickhouse.com', '.', 1); +www +select substringIndex('www.clickhouse.com', '.', 2); +www.clickhouse +select substringIndex('www.clickhouse.com', '.', 3); +www.clickhouse.com +select substringIndex('www.clickhouse.com', '.', 4); +www.clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', -4); +www.clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', -3); +www.clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', -2); +clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', -1); +com +select substringIndex(materialize('www.clickhouse.com'), '.', 0); + +select substringIndex(materialize('www.clickhouse.com'), '.', 1); +www +select substringIndex(materialize('www.clickhouse.com'), '.', 2); +www.clickhouse +select substringIndex(materialize('www.clickhouse.com'), '.', 3); +www.clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', 4); +www.clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-4)); +www.clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-3)); +www.clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-2)); +clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-1)); +com +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(0)); + +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(1)); +www +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(2)); +www.clickhouse +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(3)); +www.clickhouse.com +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(4)); +www.clickhouse.com +select substringIndex('www.clickhouse.com', '.', materialize(-4)); +www.clickhouse.com +select substringIndex('www.clickhouse.com', '.', materialize(-3)); +www.clickhouse.com +select substringIndex('www.clickhouse.com', '.', materialize(-2)); +clickhouse.com +select substringIndex('www.clickhouse.com', '.', materialize(-1)); +com +select substringIndex('www.clickhouse.com', '.', materialize(0)); + +select substringIndex('www.clickhouse.com', '.', materialize(1)); +www +select substringIndex('www.clickhouse.com', '.', materialize(2)); +www.clickhouse +select substringIndex('www.clickhouse.com', '.', materialize(3)); +www.clickhouse.com +select substringIndex('www.clickhouse.com', '.', materialize(4)); +www.clickhouse.com +select SUBSTRING_INDEX('www.clickhouse.com', '.', 2); +www.clickhouse +select substringIndex('www.clickhouse.com', '..', 2); -- { serverError BAD_ARGUMENTS } +select substringIndex('www.clickhouse.com', '', 2); -- { serverError BAD_ARGUMENTS } +select substringIndex('www.clickhouse.com', materialize('.'), 2); -- { serverError ILLEGAL_COLUMN } +select substringIndex('www.clickhouse.com', '.', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +select substringIndexUTF8('富强,民主,文明', ',', -4); +富强,民主,文明 +select substringIndexUTF8('富强,民主,文明', ',', -3); +富强,民主,文明 +select substringIndexUTF8('富强,民主,文明', ',', -2); +民主,文明 +select substringIndexUTF8('富强,民主,文明', ',', -1); +文明 +select substringIndexUTF8('富强,民主,文明', ',', 0); + +select substringIndexUTF8('富强,民主,文明', ',', 1); +富强 +select substringIndexUTF8('富强,民主,文明', ',', 2); +富强,民主 +select substringIndexUTF8('富强,民主,文明', ',', 3); +富强,民主,文明 +select substringIndexUTF8('富强,民主,文明', ',', 4); +富强,民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', -4); +富强,民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', -3); +富强,民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', -2); +民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', -1); +文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 0); + +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 1); +富强 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 2); +富强,民主 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 3); +富强,民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 4); +富强,民主,文明 +select substringIndexUTF8('富强,民主,文明', ',', materialize(-4)); +富强,民主,文明 +select substringIndexUTF8('富强,民主,文明', ',', materialize(-3)); +富强,民主,文明 +select substringIndexUTF8('富强,民主,文明', ',', materialize(-2)); +民主,文明 +select substringIndexUTF8('富强,民主,文明', ',', materialize(-1)); +文明 +select substringIndexUTF8('富强,民主,文明', ',', materialize(0)); + +select substringIndexUTF8('富强,民主,文明', ',', materialize(1)); +富强 +select substringIndexUTF8('富强,民主,文明', ',', materialize(2)); +富强,民主 +select substringIndexUTF8('富强,民主,文明', ',', materialize(3)); +富强,民主,文明 +select substringIndexUTF8('富强,民主,文明', ',', materialize(4)); +富强,民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-4)); +富强,民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-3)); +富强,民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-2)); +民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-1)); +文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(0)); + +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(1)); +富强 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(2)); +富强,民主 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(3)); +富强,民主,文明 +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(4)); +富强,民主,文明 +select substringIndexUTF8('富强,民主,文明', ',,', 2); -- { serverError BAD_ARGUMENTS } +select substringIndexUTF8('富强,民主,文明', '', 2); -- { serverError BAD_ARGUMENTS } +select substringIndexUTF8('富强,民主,文明', materialize(','), 2); -- { serverError ILLEGAL_COLUMN } +select substringIndexUTF8('富强,民主,文明', ',', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } diff --git a/tests/queries/0_stateless/02798_substring_index.sql b/tests/queries/0_stateless/02798_substring_index.sql new file mode 100644 index 00000000000..520775e8970 --- /dev/null +++ b/tests/queries/0_stateless/02798_substring_index.sql @@ -0,0 +1,93 @@ +-- { echoOn } +select substringIndex('www.clickhouse.com', '.', -4); +select substringIndex('www.clickhouse.com', '.', -3); +select substringIndex('www.clickhouse.com', '.', -2); +select substringIndex('www.clickhouse.com', '.', -1); +select substringIndex('www.clickhouse.com', '.', 0); +select substringIndex('www.clickhouse.com', '.', 1); +select substringIndex('www.clickhouse.com', '.', 2); +select substringIndex('www.clickhouse.com', '.', 3); +select substringIndex('www.clickhouse.com', '.', 4); + +select substringIndex(materialize('www.clickhouse.com'), '.', -4); +select substringIndex(materialize('www.clickhouse.com'), '.', -3); +select substringIndex(materialize('www.clickhouse.com'), '.', -2); +select substringIndex(materialize('www.clickhouse.com'), '.', -1); +select substringIndex(materialize('www.clickhouse.com'), '.', 0); +select substringIndex(materialize('www.clickhouse.com'), '.', 1); +select substringIndex(materialize('www.clickhouse.com'), '.', 2); +select substringIndex(materialize('www.clickhouse.com'), '.', 3); +select substringIndex(materialize('www.clickhouse.com'), '.', 4); + +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-4)); +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-3)); +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-2)); +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(-1)); +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(0)); +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(1)); +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(2)); +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(3)); +select substringIndex(materialize('www.clickhouse.com'), '.', materialize(4)); + +select substringIndex('www.clickhouse.com', '.', materialize(-4)); +select substringIndex('www.clickhouse.com', '.', materialize(-3)); +select substringIndex('www.clickhouse.com', '.', materialize(-2)); +select substringIndex('www.clickhouse.com', '.', materialize(-1)); +select substringIndex('www.clickhouse.com', '.', materialize(0)); +select substringIndex('www.clickhouse.com', '.', materialize(1)); +select substringIndex('www.clickhouse.com', '.', materialize(2)); +select substringIndex('www.clickhouse.com', '.', materialize(3)); +select substringIndex('www.clickhouse.com', '.', materialize(4)); + +select SUBSTRING_INDEX('www.clickhouse.com', '.', 2); + +select substringIndex('www.clickhouse.com', '..', 2); -- { serverError BAD_ARGUMENTS } +select substringIndex('www.clickhouse.com', '', 2); -- { serverError BAD_ARGUMENTS } +select substringIndex('www.clickhouse.com', materialize('.'), 2); -- { serverError ILLEGAL_COLUMN } +select substringIndex('www.clickhouse.com', '.', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } + +select substringIndexUTF8('富强,民主,文明', ',', -4); +select substringIndexUTF8('富强,民主,文明', ',', -3); +select substringIndexUTF8('富强,民主,文明', ',', -2); +select substringIndexUTF8('富强,民主,文明', ',', -1); +select substringIndexUTF8('富强,民主,文明', ',', 0); +select substringIndexUTF8('富强,民主,文明', ',', 1); +select substringIndexUTF8('富强,民主,文明', ',', 2); +select substringIndexUTF8('富强,民主,文明', ',', 3); +select substringIndexUTF8('富强,民主,文明', ',', 4); + +select substringIndexUTF8(materialize('富强,民主,文明'), ',', -4); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', -3); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', -2); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', -1); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 0); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 1); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 2); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 3); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', 4); + +select substringIndexUTF8('富强,民主,文明', ',', materialize(-4)); +select substringIndexUTF8('富强,民主,文明', ',', materialize(-3)); +select substringIndexUTF8('富强,民主,文明', ',', materialize(-2)); +select substringIndexUTF8('富强,民主,文明', ',', materialize(-1)); +select substringIndexUTF8('富强,民主,文明', ',', materialize(0)); +select substringIndexUTF8('富强,民主,文明', ',', materialize(1)); +select substringIndexUTF8('富强,民主,文明', ',', materialize(2)); +select substringIndexUTF8('富强,民主,文明', ',', materialize(3)); +select substringIndexUTF8('富强,民主,文明', ',', materialize(4)); + +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-4)); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-3)); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-2)); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(-1)); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(0)); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(1)); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(2)); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(3)); +select substringIndexUTF8(materialize('富强,民主,文明'), ',', materialize(4)); + +select substringIndexUTF8('富强,民主,文明', ',,', 2); -- { serverError BAD_ARGUMENTS } +select substringIndexUTF8('富强,民主,文明', '', 2); -- { serverError BAD_ARGUMENTS } +select substringIndexUTF8('富强,民主,文明', materialize(','), 2); -- { serverError ILLEGAL_COLUMN } +select substringIndexUTF8('富强,民主,文明', ',', cast(2 as Int128)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +-- { echoOff } From 70e49cb31c0ff80ffc6c8e6ab5687b24af659ad1 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 28 Jun 2023 11:28:20 +0800 Subject: [PATCH 04/18] add docs --- .../functions/string-functions.md | 36 +++++++++++++++++++ src/Functions/substringIndex.cpp | 16 ++------- ...new_functions_must_be_documented.reference | 1 + 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 5175bbf0615..5197b786884 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -573,6 +573,42 @@ Alias: Like `substring` but for Unicode code points. Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined. + +## substringIndex(s, delim, index) + +Returns the substring of `s` before `index` occurrences of the delimiter `delim`, as in Spark or MySQL. + +**Syntax** + +```sql +substringIndex(s, delim, index) +``` +Alias: `SUBSTRING_INDEX` + + +**Arguments** + +- s: The string to extract substring from. [String](../../sql-reference/data-types/string.md). +- delim: The character to split. [String](../../sql-reference/data-types/string.md). +- index: The number of occurrences of the delimiter to count before extracting the substring. If index is positive, everything to the left of the final delimiter (counting from the left) is returned. If index is negative, everything to the right of the final delimiter (counting from the right) is returned. [UInt or Int](../data-types/int-uint.md) + +**Example** + +``` sql +SELECT substringIndex('www.clickhouse.com', '.', 2) +``` + +Result: +``` +┌─substringIndex('www.clickhouse.com', '.', 2)─┐ +│ www.clickhouse │ +└──────────────────────────────────────────────┘ +``` + +## substringIndexUTF8(s, delim, index) + +Like `substringIndex` but for Unicode code points. Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined. + ## appendTrailingCharIfAbsent Appends character `c` to string `s` if `s` is non-empty and does not end with character `c`. diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 1fca3bbed14..fbb20b245f6 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -17,8 +17,6 @@ namespace ErrorCodes { extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_TYPE_OF_ARGUMENT; - extern const int ZERO_ARRAY_OR_TUPLE_INDEX; - extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; extern const int BAD_ARGUMENTS; } @@ -163,7 +161,6 @@ namespace StringRef str_ref = str_column->getDataAt(i); StringRef res_ref = !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); - std::cout << "result:" << res_ref.toString() << std::endl; appendToResultColumn(res_ref, res_data, res_offsets); } } @@ -207,8 +204,6 @@ namespace static StringRef substringIndexUTF8( const PositionCaseSensitiveUTF8::SearcherInBigHaystack * searcher, const StringRef & str_ref, const String & delim, Int64 index) { - std::cout << "str:" << str_ref.toString() << ", delim" << delim << ",index:" << index << std::endl; - if (index == 0) return {str_ref.data, 0}; @@ -244,27 +239,20 @@ namespace if (total + index < 0) return str_ref; - Int64 index_from_left = total + 1 + index; - std::cout << "total:" << total << ", index_from_left" << index_from_left << std::endl; pos = begin; Int64 i = 0; + Int64 index_from_left = total + 1 + index; while (i < index_from_left && pos < end && end != (pos = searcher->search(pos, end - pos))) { pos += delim.size(); ++i; - std::cout << "pos offset:" << pos - begin << ", total size:" << end - begin << std::endl; } - std::cout << "pos offset:" << pos - begin << ", size:" << end - pos << std::endl; - StringRef res = {pos, static_cast(end - pos)}; - std::cout << "result:" << res.toString() << std::endl; - return res; + return {pos, static_cast(end - pos)}; } } static StringRef substringIndex(const StringRef & str_ref, char delim, Int64 index) { - std::cout << "str:" << str_ref.toString() << ", delim" << delim << ",index:" << index << std::endl; - if (index == 0) return {str_ref.data, 0}; diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index b5c133988e6..6c904d6fc05 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -666,6 +666,7 @@ startsWith subBitmap substring substringUTF8 +substringIndex subtractDays subtractHours subtractMicroseconds From 3f73d3f48aa679dc689dea6e49594752461e4d8b Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 28 Jun 2023 11:33:39 +0800 Subject: [PATCH 05/18] fix failed check --- .../02415_all_new_functions_must_be_documented.reference | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference index 6c904d6fc05..a2621949d0d 100644 --- a/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference +++ b/tests/queries/0_stateless/02415_all_new_functions_must_be_documented.reference @@ -665,8 +665,9 @@ sqrt startsWith subBitmap substring -substringUTF8 substringIndex +substringIndexUTF8 +substringUTF8 subtractDays subtractHours subtractMicroseconds From 375f7abfeba866ae7956e58e9bd1bf364b972ea5 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 28 Jun 2023 12:27:59 +0800 Subject: [PATCH 06/18] fix spelling --- utils/check-style/aspell-ignore/en/aspell-dict.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/check-style/aspell-ignore/en/aspell-dict.txt b/utils/check-style/aspell-ignore/en/aspell-dict.txt index 00d047121e6..9af48417250 100644 --- a/utils/check-style/aspell-ignore/en/aspell-dict.txt +++ b/utils/check-style/aspell-ignore/en/aspell-dict.txt @@ -210,6 +210,7 @@ Decrypted Deduplicate Deduplication DelayedInserts +delim DeliveryTag DeltaLake Denormalize @@ -834,6 +835,8 @@ Subexpression Submodules Subqueries Substrings +substringIndex +substringIndexUTF SummingMergeTree SuperSet Superset From 98966796d0e003d618aade919f7f3e52788ce7e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 29 Jun 2023 10:18:47 +0800 Subject: [PATCH 07/18] Update docs/en/sql-reference/functions/string-functions.md Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- docs/en/sql-reference/functions/string-functions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 5197b786884..f6b629f1179 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -574,9 +574,9 @@ Alias: Like `substring` but for Unicode code points. Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined. -## substringIndex(s, delim, index) +## substringIndex(s, delim, count) -Returns the substring of `s` before `index` occurrences of the delimiter `delim`, as in Spark or MySQL. +Returns the substring of `s` before `count` occurrences of the delimiter `delim`, as in Spark or MySQL. **Syntax** From e2236384d1795ac8f95cb1281b7e5199f8844e8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 29 Jun 2023 10:18:54 +0800 Subject: [PATCH 08/18] Update docs/en/sql-reference/functions/string-functions.md Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- docs/en/sql-reference/functions/string-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index f6b629f1179..f3bcc99d83d 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -605,7 +605,7 @@ Result: └──────────────────────────────────────────────┘ ``` -## substringIndexUTF8(s, delim, index) +## substringIndexUTF8(s, delim, count) Like `substringIndex` but for Unicode code points. Assumes that the string contains valid UTF-8 encoded text. If this assumption is violated, no exception is thrown and the result is undefined. From e9bac152e1a5f08845c2d40e608ed293bd5c0384 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 29 Jun 2023 10:19:04 +0800 Subject: [PATCH 09/18] Update docs/en/sql-reference/functions/string-functions.md Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- docs/en/sql-reference/functions/string-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index f3bcc99d83d..3ed60434834 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -581,7 +581,7 @@ Returns the substring of `s` before `count` occurrences of the delimiter `delim` **Syntax** ```sql -substringIndex(s, delim, index) +substringIndex(s, delim, count) ``` Alias: `SUBSTRING_INDEX` From 40ded2eca001ecc145358d4ab4c3a5e43738d2e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 29 Jun 2023 10:19:13 +0800 Subject: [PATCH 10/18] Update docs/en/sql-reference/functions/string-functions.md Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- docs/en/sql-reference/functions/string-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/string-functions.md b/docs/en/sql-reference/functions/string-functions.md index 3ed60434834..12aa8d2c076 100644 --- a/docs/en/sql-reference/functions/string-functions.md +++ b/docs/en/sql-reference/functions/string-functions.md @@ -590,7 +590,7 @@ Alias: `SUBSTRING_INDEX` - s: The string to extract substring from. [String](../../sql-reference/data-types/string.md). - delim: The character to split. [String](../../sql-reference/data-types/string.md). -- index: The number of occurrences of the delimiter to count before extracting the substring. If index is positive, everything to the left of the final delimiter (counting from the left) is returned. If index is negative, everything to the right of the final delimiter (counting from the right) is returned. [UInt or Int](../data-types/int-uint.md) +- count: The number of occurrences of the delimiter to count before extracting the substring. If count is positive, everything to the left of the final delimiter (counting from the left) is returned. If count is negative, everything to the right of the final delimiter (counting from the right) is returned. [UInt or Int](../data-types/int-uint.md) **Example** From f049914c2f47d3e202a936464f4fd04390c1669f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 29 Jun 2023 10:19:19 +0800 Subject: [PATCH 11/18] Update src/Functions/substringIndex.cpp Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- src/Functions/substringIndex.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index fbb20b245f6..34d3ab4b3fb 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -57,7 +57,7 @@ namespace arguments[1]->getName(), getName()); - if (!isNativeNumber(arguments[2])) + if (!isNativeInteger(arguments[2])) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of third argument of function {}", From 21ff69772caf01c03512c8076bcf0e95e7805588 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 29 Jun 2023 10:19:28 +0800 Subject: [PATCH 12/18] Update src/Functions/substringIndex.cpp Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- src/Functions/substringIndex.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 34d3ab4b3fb..fb74936b0bc 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -46,7 +46,7 @@ namespace if (!isString(arguments[0])) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of first argument of function {}", + "Illegal type {} of first argument of function {}, String expected", arguments[0]->getName(), getName()); From a005b5d0c8aaefb8d1c4b74fffefe464e882d329 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 29 Jun 2023 10:19:39 +0800 Subject: [PATCH 13/18] Update src/Functions/substringIndex.cpp Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- src/Functions/substringIndex.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index fb74936b0bc..653ef9e509a 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -53,7 +53,7 @@ namespace if (!isString(arguments[1])) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of second argument of function {}", + "Illegal type {} of second argument of function {}, String expected", arguments[1]->getName(), getName()); From a35476ee13573b37ad0ae667c3c4b2405f681d01 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 29 Jun 2023 10:19:48 +0800 Subject: [PATCH 14/18] Update src/Functions/substringIndex.cpp Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- src/Functions/substringIndex.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 653ef9e509a..963420a4fee 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -60,7 +60,7 @@ namespace if (!isNativeInteger(arguments[2])) throw Exception( ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Illegal type {} of third argument of function {}", + "Illegal type {} of third argument of function {}, Integer expected", arguments[2]->getName(), getName()); From 9e34227fe96a2c37d8895663c7fecdd3688037fd Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 29 Jun 2023 10:39:20 +0800 Subject: [PATCH 15/18] change as requested --- src/Functions/substringIndex.cpp | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 963420a4fee..903edfe5031 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -136,8 +136,13 @@ namespace { StringRef str_ref = str_column->getDataAt(i); Int64 index = index_column->getInt(i); - StringRef res_ref - = !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); + + StringRef res_ref; + if constexpr (!is_utf8) + res_ref = substringIndex(str_ref, delim[0], index); + else + res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, index); + appendToResultColumn(res_ref, res_data, res_offsets); } } @@ -159,8 +164,13 @@ namespace for (size_t i = 0; i < rows; ++i) { StringRef str_ref = str_column->getDataAt(i); - StringRef res_ref - = !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); + + StringRef res_ref; + if constexpr (!is_utf8) + res_ref = substringIndex(str_ref, delim[0], index); + else + res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, index); + appendToResultColumn(res_ref, res_data, res_offsets); } } @@ -183,8 +193,13 @@ namespace for (size_t i = 0; i < rows; ++i) { Int64 index = index_column->getInt(i); - StringRef res_ref - = !is_utf8 ? substringIndex(str_ref, delim[0], index) : substringIndexUTF8(searcher.get(), str_ref, delim, index); + + StringRef res_ref; + if constexpr (!is_utf8) + res_ref = substringIndex(str_ref, delim[0], index); + else + res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, index); + appendToResultColumn(res_ref, res_data, res_offsets); } } From 3a01a859d9aaef5fc6bba54cc0a40c7c30c4a23f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E6=89=AC?= <654010905@qq.com> Date: Thu, 29 Jun 2023 10:39:25 +0800 Subject: [PATCH 16/18] Update src/Functions/substringIndex.cpp Co-authored-by: Yakov Olkhovskiy <99031427+yakov-olkhovskiy@users.noreply.github.com> --- src/Functions/substringIndex.cpp | 46 +++++++++----------------------- 1 file changed, 12 insertions(+), 34 deletions(-) diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index 963420a4fee..41b46af284a 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -256,41 +256,19 @@ namespace if (index == 0) return {str_ref.data, 0}; - if (index > 0) - { - const auto * end = str_ref.data + str_ref.size; - const auto * pos = str_ref.data; - Int64 i = 0; - while (i < index) - { - pos = std::find(pos, end, delim); - if (pos != end) - { - ++pos; - ++i; - } - else - return str_ref; - } - return {str_ref.data, static_cast(pos - str_ref.data - 1)}; - } - else - { - const auto * begin = str_ref.data; - const auto * pos = str_ref.data + str_ref.size; - Int64 i = 0; - while (i + index < 0) - { - --pos; - while (pos >= begin && *pos != delim) - --pos; + const auto pos = index > 0 ? str_ref.data : str_ref.data + str_ref.size - 1; + const auto end = index > 0 ? str_ref.data + str_ref.size : str_ref.data - 1; + int d = index > 0 ? 1 : -1; - if (pos >= begin) - ++i; - else - return str_ref; - } - return {pos + 1, static_cast(str_ref.data + str_ref.size - pos - 1)}; + for (; index; pos += d) + { + if (pos == end) + return str_ref; + if (*pos == delim) + index -= d; + } + pos -= d; + return {d > 0 ? str_ref.data : pos + 1, static_cast(d > 0 ? pos - str_ref.data : str_ref.data + str_ref.size - pos - 1)} ; } } }; From 95a9270b747322bb376dbfacde8aa58ce0835930 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Thu, 29 Jun 2023 10:54:40 +0800 Subject: [PATCH 17/18] change as request --- src/Functions/substringIndex.cpp | 70 ++++++++++++++++---------------- 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index ade8d0ce504..d1791c9696b 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -71,7 +71,7 @@ namespace { ColumnPtr column_string = arguments[0].column; ColumnPtr column_delim = arguments[1].column; - ColumnPtr column_index = arguments[2].column; + ColumnPtr column_count = arguments[2].column; const ColumnConst * column_delim_const = checkAndGetColumnConst(column_delim.get()); if (!column_delim_const) @@ -97,7 +97,7 @@ namespace if (column_string_const) { String str = column_string_const->getValue(); - constantVector(str, delim, column_index.get(), vec_res, offsets_res); + constantVector(str, delim, column_count.get(), vec_res, offsets_res); } else { @@ -105,14 +105,14 @@ namespace if (!col_str) throw Exception(ErrorCodes::ILLEGAL_COLUMN, "First argument to {} must be a String", getName()); - bool is_index_const = isColumnConst(*column_index); - if (is_index_const) + bool is_count_const = isColumnConst(*column_count); + if (is_count_const) { - Int64 index = column_index->getInt(0); - vectorConstant(col_str, delim, index, vec_res, offsets_res); + Int64 count = column_count->getInt(0); + vectorConstant(col_str, delim, count, vec_res, offsets_res); } else - vectorVector(col_str, delim, column_index.get(), vec_res, offsets_res); + vectorVector(col_str, delim, column_count.get(), vec_res, offsets_res); } return column_res; } @@ -121,7 +121,7 @@ namespace static void vectorVector( const ColumnString * str_column, const String & delim, - const IColumn * index_column, + const IColumn * count_column, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { @@ -135,13 +135,13 @@ namespace for (size_t i = 0; i < rows; ++i) { StringRef str_ref = str_column->getDataAt(i); - Int64 index = index_column->getInt(i); + Int64 count = count_column->getInt(i); StringRef res_ref; if constexpr (!is_utf8) - res_ref = substringIndex(str_ref, delim[0], index); + res_ref = substringIndex(str_ref, delim[0], count); else - res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, index); + res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, count); appendToResultColumn(res_ref, res_data, res_offsets); } @@ -150,7 +150,7 @@ namespace static void vectorConstant( const ColumnString * str_column, const String & delim, - Int64 index, + Int64 count, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { @@ -167,9 +167,9 @@ namespace StringRef res_ref; if constexpr (!is_utf8) - res_ref = substringIndex(str_ref, delim[0], index); + res_ref = substringIndex(str_ref, delim[0], count); else - res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, index); + res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, count); appendToResultColumn(res_ref, res_data, res_offsets); } @@ -178,11 +178,11 @@ namespace static void constantVector( const String & str, const String & delim, - const IColumn * index_column, + const IColumn * count_column, ColumnString::Chars & res_data, ColumnString::Offsets & res_offsets) { - size_t rows = index_column->size(); + size_t rows = count_column->size(); res_data.reserve(str.size() * rows / 2); res_offsets.reserve(rows); @@ -192,13 +192,13 @@ namespace StringRef str_ref{str.data(), str.size()}; for (size_t i = 0; i < rows; ++i) { - Int64 index = index_column->getInt(i); + Int64 count = count_column->getInt(i); StringRef res_ref; if constexpr (!is_utf8) - res_ref = substringIndex(str_ref, delim[0], index); + res_ref = substringIndex(str_ref, delim[0], count); else - res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, index); + res_ref = substringIndexUTF8(searcher.get(), str_ref, delim, count); appendToResultColumn(res_ref, res_data, res_offsets); } @@ -217,18 +217,18 @@ namespace } static StringRef substringIndexUTF8( - const PositionCaseSensitiveUTF8::SearcherInBigHaystack * searcher, const StringRef & str_ref, const String & delim, Int64 index) + const PositionCaseSensitiveUTF8::SearcherInBigHaystack * searcher, const StringRef & str_ref, const String & delim, Int64 count) { - if (index == 0) + if (count == 0) return {str_ref.data, 0}; const auto * begin = reinterpret_cast(str_ref.data); const auto * end = reinterpret_cast(str_ref.data + str_ref.size); const auto * pos = begin; - if (index > 0) + if (count > 0) { Int64 i = 0; - while (i < index) + while (i < count) { pos = searcher->search(pos, end - pos); @@ -251,13 +251,13 @@ namespace ++total; } - if (total + index < 0) + if (total + count < 0) return str_ref; pos = begin; Int64 i = 0; - Int64 index_from_left = total + 1 + index; - while (i < index_from_left && pos < end && end != (pos = searcher->search(pos, end - pos))) + Int64 count_from_left = total + 1 + count; + while (i < count_from_left && pos < end && end != (pos = searcher->search(pos, end - pos))) { pos += delim.size(); ++i; @@ -266,25 +266,25 @@ namespace } } - static StringRef substringIndex(const StringRef & str_ref, char delim, Int64 index) + static StringRef substringIndex(const StringRef & str_ref, char delim, Int64 count) { - if (index == 0) + if (count == 0) return {str_ref.data, 0}; - const auto pos = index > 0 ? str_ref.data : str_ref.data + str_ref.size - 1; - const auto end = index > 0 ? str_ref.data + str_ref.size : str_ref.data - 1; - int d = index > 0 ? 1 : -1; + const auto * pos = count > 0 ? str_ref.data : str_ref.data + str_ref.size - 1; + const auto * end = count > 0 ? str_ref.data + str_ref.size : str_ref.data - 1; + int d = count > 0 ? 1 : -1; - for (; index; pos += d) + for (; count; pos += d) { if (pos == end) return str_ref; if (*pos == delim) - index -= d; + count -= d; } pos -= d; - return {d > 0 ? str_ref.data : pos + 1, static_cast(d > 0 ? pos - str_ref.data : str_ref.data + str_ref.size - pos - 1)} ; - } + return { + d > 0 ? str_ref.data : pos + 1, static_cast(d > 0 ? pos - str_ref.data : str_ref.data + str_ref.size - pos - 1)}; } }; } From d8a66a81233441676fdd8f0c786060c2b1aacd56 Mon Sep 17 00:00:00 2001 From: taiyang-li <654010905@qq.com> Date: Wed, 5 Jul 2023 17:49:01 +0800 Subject: [PATCH 18/18] fix asan error --- src/Functions/substringIndex.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Functions/substringIndex.cpp b/src/Functions/substringIndex.cpp index d1791c9696b..5f3f054b624 100644 --- a/src/Functions/substringIndex.cpp +++ b/src/Functions/substringIndex.cpp @@ -208,7 +208,7 @@ namespace { size_t res_offset = res_data.size(); res_data.resize(res_offset + res_ref.size + 1); - memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], res_ref.data, res_ref.size); + memcpy(&res_data[res_offset], res_ref.data, res_ref.size); res_offset += res_ref.size; res_data[res_offset] = 0; ++res_offset;