#include "FunctionsStringSearch.h"

// NOTE(review): the header names below are reconstructed from the names this file uses;
// confirm the exact list against the build before merging.
#include <algorithm>
#include <cctype>
#include <cstring>
#include <memory>
#include <string>
#include <vector>
#include <Columns/ColumnFixedString.h>
#include <DataTypes/DataTypeFixedString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/Regexps.h>
#include <IO/WriteHelpers.h>
#include <Poco/UTF8String.h>
#include <Common/Volnitsky.h>


namespace DB
{
/** Implementation details for functions of 'position' family depending on ASCII/UTF8 and case sensitiveness.
  *
  * Every `Position*` struct below exposes the same set of members, so that the `*Impl` templates
  * can be instantiated with any of them:
  *  - three searcher type aliases and the matching `create*` factories;
  *  - `countChars`, which turns a byte range into a character count;
  *  - `toLowerIfNeed`, which normalizes a string for the constant-constant case.
  */
struct PositionCaseSensitiveASCII
{
    /// For searching single substring inside big-enough contiguous chunk of data. Could have slightly expensive initialization.
    using SearcherInBigHaystack = Volnitsky;

    /// For search many substrings in one string
    using MultiSearcherInBigHaystack = MultiVolnitsky;

    /// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
    using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;

    static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
    {
        return SearcherInBigHaystack(needle_data, needle_size, haystack_size_hint);
    }

    static SearcherInSmallHaystack createSearcherInSmallHaystack(const char * needle_data, size_t needle_size)
    {
        return SearcherInSmallHaystack(needle_data, needle_size);
    }

    static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
    {
        return MultiSearcherInBigHaystack(needles);
    }

    /// Number of code points between 'begin' and 'end' (this has different behaviour for ASCII and UTF-8).
    /// For ASCII every byte is one character, so this is just the byte distance.
    static size_t countChars(const char * begin, const char * end) { return end - begin; }

    /// Convert string to lowercase. Only for case-insensitive search.
    /// Implementation is permitted to be inefficient because it is called for single string.
    /// Case-sensitive search needs no normalization, so this is a no-op.
    static void toLowerIfNeed(std::string &) {}
};

struct PositionCaseInsensitiveASCII
{
    /// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
    using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
    using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
    using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;

    /// The haystack size hint is ignored: this searcher is constructed from the needle only.
    static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
    {
        return SearcherInBigHaystack(needle_data, needle_size);
    }

    static SearcherInSmallHaystack createSearcherInSmallHaystack(const char * needle_data, size_t needle_size)
    {
        return SearcherInSmallHaystack(needle_data, needle_size);
    }

    static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
    {
        return MultiSearcherInBigHaystack(needles);
    }

    /// One byte is one character.
    static size_t countChars(const char * begin, const char * end) { return end - begin; }

    /// Lowercase the string in place, byte by byte.
    static void toLowerIfNeed(std::string & s)
    {
        /// `tolower` has undefined behaviour when given a negative value other than EOF,
        /// which is what a plain (signed) `char` >= 0x80 converts to. Go through `unsigned char`.
        std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    }
};

struct PositionCaseSensitiveUTF8
{
    using SearcherInBigHaystack = VolnitskyUTF8;
    using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
    using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;

    static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
    {
        return SearcherInBigHaystack(needle_data, needle_size, haystack_size_hint);
    }

    static SearcherInSmallHaystack createSearcherInSmallHaystack(const char * needle_data, size_t needle_size)
    {
        return SearcherInSmallHaystack(needle_data, needle_size);
    }

    static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
    {
        return MultiSearcherInBigHaystack(needles);
    }

    /// Counts every byte in [begin, end) that is not a UTF-8 continuation octet.
    static size_t countChars(const char * begin, const char * end)
    {
        size_t res = 0;
        for (auto it = begin; it != end; ++it)
            if (!UTF8::isContinuationOctet(static_cast<UInt8>(*it)))
                ++res;
        return res;
    }

    /// Case-sensitive search needs no normalization.
    static void toLowerIfNeed(std::string &) {}
};

struct PositionCaseInsensitiveUTF8
{
    using SearcherInBigHaystack = VolnitskyCaseInsensitiveUTF8;
    using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitiveUTF8;
    using SearcherInSmallHaystack = UTF8CaseInsensitiveStringSearcher; /// TODO Very suboptimal.

    static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
    {
        return SearcherInBigHaystack(needle_data, needle_size, haystack_size_hint);
    }

    static SearcherInSmallHaystack createSearcherInSmallHaystack(const char * needle_data, size_t needle_size)
    {
        return SearcherInSmallHaystack(needle_data, needle_size);
    }

    static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<StringRef> & needles)
    {
        return MultiSearcherInBigHaystack(needles);
    }

    /// Counts every byte in [begin, end) that is not a UTF-8 continuation octet.
    static size_t countChars(const char * begin, const char * end)
    {
        size_t res = 0;
        for (auto it = begin; it != end; ++it)
            if (!UTF8::isContinuationOctet(static_cast<UInt8>(*it)))
                ++res;
        return res;
    }

    /// Lowercase the string in place using Poco's UTF-8 aware conversion.
    static void toLowerIfNeed(std::string & s) { Poco::UTF8::toLowerInPlace(s); }
};


/** Computes the 1-based position of a needle inside a haystack, or 0 when it is not found.
  * Positions are expressed in the units returned by Impl::countChars.
  * The four entry points cover every combination of constant/column haystack and needle.
  *
  * String columns are given as a flat byte array plus per-row end offsets;
  * each row is followed by a terminating zero byte, which is why sizes are computed as `offset difference - 1`.
  */
template <typename Impl>
struct PositionImpl
{
    using ResultType = UInt64;

    /// Find one substring in many strings.
    /// `res` must already have one element per row; every element is written.
    static void vector_constant(
        const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray<UInt64> & res)
    {
        const UInt8 * begin = data.data();
        const UInt8 * pos = begin;
        const UInt8 * end = pos + data.size();

        /// Current index in the array of strings.
        size_t i = 0;

        typename Impl::SearcherInBigHaystack searcher = Impl::createSearcherInBigHaystack(needle.data(), needle.size(), end - pos);

        /// We will search for the next occurrence in all strings at once.
        while (pos < end && end != (pos = searcher.search(pos, end - pos)))
        {
            /// Determine which index it refers to. Rows skipped over contain no match.
            while (begin + offsets[i] <= pos)
            {
                res[i] = 0;
                ++i;
            }

            /// We check that the entry does not pass through the boundaries of strings.
            /// NOTE(review): for i == 0 this reads offsets[-1]; presumably relies on the offsets array
            /// having zero-filled padding before its first element - confirm against PaddedPODArray.
            if (pos + needle.size() < begin + offsets[i])
                res[i] = 1 + Impl::countChars(reinterpret_cast<const char *>(begin + offsets[i - 1]), reinterpret_cast<const char *>(pos));
            else
                res[i] = 0;

            /// Only the first occurrence per row is needed, so continue from the next row.
            pos = begin + offsets[i];
            ++i;
        }

        /// Rows after the last match were never visited.
        if (i < res.size())
            memset(&res[i], 0, (res.size() - i) * sizeof(res[0]));
    }

    /// Search for substring in string.
    /// Both arguments are taken by value because they are normalized in place before searching.
    static void constant_constant(std::string data, std::string needle, UInt64 & res)
    {
        Impl::toLowerIfNeed(data);
        Impl::toLowerIfNeed(needle);

        const size_t found = data.find(needle);
        if (found == std::string::npos)
            res = 0;
        else
            res = 1 + Impl::countChars(data.data(), data.data() + found);
    }

    /// Search each time for a different single substring inside each time different string.
    static void vector_vector(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const ColumnString::Chars & needle_data,
        const ColumnString::Offsets & needle_offsets,
        PaddedPODArray<UInt64> & res)
    {
        ColumnString::Offset prev_haystack_offset = 0;
        ColumnString::Offset prev_needle_offset = 0;

        const size_t size = haystack_offsets.size();

        for (size_t i = 0; i < size; ++i)
        {
            /// Sizes exclude the zero byte at the end of each row.
            const size_t needle_size = needle_offsets[i] - prev_needle_offset - 1;
            const size_t haystack_size = haystack_offsets[i] - prev_haystack_offset - 1;

            if (0 == needle_size)
            {
                /// An empty string is always at the very beginning of `haystack`.
                res[i] = 1;
            }
            else
            {
                /// It is assumed that the StringSearcher is not very difficult to initialize.
                typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
                    reinterpret_cast<const char *>(&needle_data[prev_needle_offset]), needle_size);

                /// searcher returns a pointer to the found substring or to the end of `haystack`.
                const size_t pos = searcher.search(&haystack_data[prev_haystack_offset], &haystack_data[haystack_offsets[i] - 1])
                    - &haystack_data[prev_haystack_offset];

                if (pos != haystack_size)
                {
                    res[i] = 1
                        + Impl::countChars(
                                 reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]),
                                 reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset + pos]));
                }
                else
                    res[i] = 0;
            }

            prev_haystack_offset = haystack_offsets[i];
            prev_needle_offset = needle_offsets[i];
        }
    }

    /// Find many substrings in single string.
    static void constant_vector(
        const String & haystack,
        const ColumnString::Chars & needle_data,
        const ColumnString::Offsets & needle_offsets,
        PaddedPODArray<UInt64> & res)
    {
        // NOTE You could use haystack indexing. But this is a rare case.

        ColumnString::Offset prev_needle_offset = 0;

        const size_t size = needle_offsets.size();

        for (size_t i = 0; i < size; ++i)
        {
            /// Size excludes the zero byte at the end of the row.
            const size_t needle_size = needle_offsets[i] - prev_needle_offset - 1;

            if (0 == needle_size)
            {
                /// An empty needle is reported at position 1.
                res[i] = 1;
            }
            else
            {
                typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
                    reinterpret_cast<const char *>(&needle_data[prev_needle_offset]), needle_size);

                /// A result equal to haystack.size() means "not found".
                const size_t pos = searcher.search(
                                       reinterpret_cast<const UInt8 *>(haystack.data()),
                                       reinterpret_cast<const UInt8 *>(haystack.data()) + haystack.size())
                    - reinterpret_cast<const UInt8 *>(haystack.data());

                if (pos != haystack.size())
                {
                    res[i] = 1 + Impl::countChars(haystack.data(), haystack.data() + pos);
                }
                else
                    res[i] = 0;
            }

            prev_needle_offset = needle_offsets[i];
        }
    }
};

/** For every haystack row and every needle, reports the 1-based position of the needle (0 is the initial value).
  * `res` is laid out row by row: needles.size() consecutive slots per haystack.
  */
template <typename Impl>
struct MultiSearchAllPositionsImpl
{
    using ResultType = UInt64;

    static void vector_constant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<StringRef> & needles,
        PaddedPODArray<UInt64> & res)
    {
        /// Converts a match given as [haystack start, match start) into a 1-based character position.
        auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
        {
            return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
        };

        auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
        const size_t haystack_string_size = haystack_offsets.size();
        const size_t needles_size = needles.size();

        /// Something can be uninitialized after the search itself
        std::fill(res.begin(), res.end(), 0);

        /// Every round of hasMoreToSearch() scans all haystack rows again.
        while (searcher.hasMoreToSearch())
        {
            size_t prev_offset = 0;
            for (size_t j = 0, from = 0; j < haystack_string_size; ++j, from += needles_size)
            {
                const auto * haystack = &haystack_data[prev_offset];
                /// Exclude the zero byte at the end of the row.
                const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
                searcher.searchOneAll(haystack, haystack_end, res.data() + from, res_callback);
                prev_offset = haystack_offsets[j];
            }
        }
    }
};

/** For every haystack row, stores the result of searcher.searchOne (non-zero once any round has matched). */
template <typename Impl>
struct MultiSearchImpl
{
    using ResultType = UInt8;
    static constexpr bool is_using_hyperscan = false;

    static void vector_constant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<StringRef> & needles,
        PaddedPODArray<UInt8> & res)
    {
        auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
        const size_t haystack_string_size = haystack_offsets.size();

        size_t iteration = 0;
        while (searcher.hasMoreToSearch())
        {
            size_t prev_offset = 0;
            for (size_t j = 0; j < haystack_string_size; ++j)
            {
                const auto * haystack = &haystack_data[prev_offset];
                const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;

                /// The first round initializes res[j]; later rounds only revisit rows without a match so far.
                if (iteration == 0 || !res[j])
                    res[j] = searcher.searchOne(haystack, haystack_end);

                prev_offset = haystack_offsets[j];
            }
            ++iteration;
        }
    }
};

/** For every haystack row, reports the smallest 1-based match position over all needles, or 0 if none matched. */
template <typename Impl>
struct MultiSearchFirstPositionImpl
{
    using ResultType = UInt64;
    static constexpr bool is_using_hyperscan = false;

    static void vector_constant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<StringRef> & needles,
        PaddedPODArray<UInt64> & res)
    {
        /// Converts a match given as [haystack start, match start) into a 1-based character position.
        auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
        {
            return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
        };

        auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
        const size_t haystack_string_size = haystack_offsets.size();

        size_t iteration = 0;
        while (searcher.hasMoreToSearch())
        {
            size_t prev_offset = 0;
            for (size_t j = 0; j < haystack_string_size; ++j)
            {
                const auto * haystack = &haystack_data[prev_offset];
                const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;

                if (iteration == 0 || res[j] == 0)
                {
                    /// Nothing found for this row yet: take whatever this round reports.
                    res[j] = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
                }
                else
                {
                    /// Keep the minimum over rounds; 0 means "no match in this round" and must not win.
                    const UInt64 result = searcher.searchOneFirstPosition(haystack, haystack_end, res_callback);
                    if (result != 0)
                        res[j] = std::min(result, res[j]);
                }

                prev_offset = haystack_offsets[j];
            }
            ++iteration;
        }
    }
};

/** For every haystack row, stores the first non-zero result of searcher.searchOneFirstIndex, or 0 if there was none. */
template <typename Impl>
struct MultiSearchFirstIndexImpl
{
    using ResultType = UInt64;
    static constexpr bool is_using_hyperscan = false;

    static void vector_constant(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<StringRef> & needles,
        PaddedPODArray<UInt64> & res)
    {
        auto searcher = Impl::createMultiSearcherInBigHaystack(needles);
        const size_t haystack_string_size = haystack_offsets.size();

        size_t iteration = 0;
        while (searcher.hasMoreToSearch())
        {
            size_t prev_offset = 0;
            for (size_t j = 0; j < haystack_string_size; ++j)
            {
                const auto * haystack = &haystack_data[prev_offset];
                const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;

                /// hasMoreToSearch traverse needles in increasing order,
                /// so a row that already has a result is never overwritten by a later round.
                if (iteration == 0 || res[j] == 0)
                    res[j] = searcher.searchOneFirstIndex(haystack, haystack_end);

                prev_offset = haystack_offsets[j];
            }
            ++iteration;
        }
    }
};


/// Name tags: each one carries the SQL-visible name of a function.

struct NamePosition
{
    static constexpr auto name = "position";
};
struct NamePositionUTF8
{
    static constexpr auto name = "positionUTF8";
};
struct NamePositionCaseInsensitive
{
    static constexpr auto name = "positionCaseInsensitive";
};
struct NamePositionCaseInsensitiveUTF8
{
    static constexpr auto name = "positionCaseInsensitiveUTF8";
};

struct NameMultiSearchAllPositions
{
    static constexpr auto name = "multiSearchAllPositions";
};
struct NameMultiSearchAllPositionsUTF8
{
    static constexpr auto name = "multiSearchAllPositionsUTF8";
};
struct NameMultiSearchAllPositionsCaseInsensitive
{
    static constexpr auto name = "multiSearchAllPositionsCaseInsensitive";
};
struct NameMultiSearchAllPositionsCaseInsensitiveUTF8
{
    static constexpr auto name = "multiSearchAllPositionsCaseInsensitiveUTF8";
};

struct NameMultiSearchAny
{
    static constexpr auto name = "multiSearchAny";
};
struct NameMultiSearchAnyUTF8
{
    static constexpr auto name = "multiSearchAnyUTF8";
};
struct NameMultiSearchAnyCaseInsensitive
{
    static constexpr auto name = "multiSearchAnyCaseInsensitive";
};
struct NameMultiSearchAnyCaseInsensitiveUTF8
{
    static constexpr auto name = "multiSearchAnyCaseInsensitiveUTF8";
};

struct NameMultiSearchFirstIndex
{
    static constexpr auto name = "multiSearchFirstIndex";
};
struct NameMultiSearchFirstIndexUTF8
{
    static constexpr auto name = "multiSearchFirstIndexUTF8";
};
struct NameMultiSearchFirstIndexCaseInsensitive
{
    static constexpr auto name = "multiSearchFirstIndexCaseInsensitive";
};
struct NameMultiSearchFirstIndexCaseInsensitiveUTF8
{
    static constexpr auto name = "multiSearchFirstIndexCaseInsensitiveUTF8";
};

struct NameMultiSearchFirstPosition
{
    static constexpr auto name = "multiSearchFirstPosition";
};
struct NameMultiSearchFirstPositionUTF8
{
    static constexpr auto name = "multiSearchFirstPositionUTF8";
};
struct NameMultiSearchFirstPositionCaseInsensitive
{
    static constexpr auto name = "multiSearchFirstPositionCaseInsensitive";
};
struct NameMultiSearchFirstPositionCaseInsensitiveUTF8
{
    static constexpr auto name = "multiSearchFirstPositionCaseInsensitiveUTF8";
};


/// Concrete function types: an implementation template instantiated with a Position* trait, paired with its name tag.

using FunctionPosition = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveASCII>, NamePosition>;
using FunctionPositionUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveUTF8>, NamePositionUTF8>;
using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveASCII>, NamePositionCaseInsensitive>;
using FunctionPositionCaseInsensitiveUTF8
    = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;

using FunctionMultiSearchAllPositions
    = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
using FunctionMultiSearchAllPositionsUTF8
    = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
using FunctionMultiSearchAllPositionsCaseInsensitive
    = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAllPositionsCaseInsensitive>;
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8
    = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAllPositionsCaseInsensitiveUTF8>;

using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearchAny>;
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAnyUTF8>;
using FunctionMultiSearchCaseInsensitive
    = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAnyCaseInsensitive>;
using FunctionMultiSearchCaseInsensitiveUTF8
    = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAnyCaseInsensitiveUTF8>;

using FunctionMultiSearchFirstIndex
    = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
using FunctionMultiSearchFirstIndexUTF8
    = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
using FunctionMultiSearchFirstIndexCaseInsensitive
    = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstIndexCaseInsensitive>;
using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8
    = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstIndexCaseInsensitiveUTF8>;

using FunctionMultiSearchFirstPosition
    = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
using FunctionMultiSearchFirstPositionUTF8
    = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
using FunctionMultiSearchFirstPositionCaseInsensitive
    = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8
    = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstPositionCaseInsensitiveUTF8>;


/// Registers all functions of the position / multiSearch* families in the factory.
/// Only `position` is registered with the CaseInsensitive flag; `locate` is added as an alias of it with the same flag.
void registerFunctionsStringSearch(FunctionFactory & factory)
{
    factory.registerFunction<FunctionPosition>(FunctionFactory::CaseInsensitive);
    factory.registerFunction<FunctionPositionUTF8>();
    factory.registerFunction<FunctionPositionCaseInsensitive>();
    factory.registerFunction<FunctionPositionCaseInsensitiveUTF8>();

    factory.registerFunction<FunctionMultiSearchAllPositions>();
    factory.registerFunction<FunctionMultiSearchAllPositionsUTF8>();
    factory.registerFunction<FunctionMultiSearchAllPositionsCaseInsensitive>();
    factory.registerFunction<FunctionMultiSearchAllPositionsCaseInsensitiveUTF8>();

    factory.registerFunction<FunctionMultiSearch>();
    factory.registerFunction<FunctionMultiSearchUTF8>();
    factory.registerFunction<FunctionMultiSearchCaseInsensitive>();
    factory.registerFunction<FunctionMultiSearchCaseInsensitiveUTF8>();

    factory.registerFunction<FunctionMultiSearchFirstIndex>();
    factory.registerFunction<FunctionMultiSearchFirstIndexUTF8>();
    factory.registerFunction<FunctionMultiSearchFirstIndexCaseInsensitive>();
    factory.registerFunction<FunctionMultiSearchFirstIndexCaseInsensitiveUTF8>();

    factory.registerFunction<FunctionMultiSearchFirstPosition>();
    factory.registerFunction<FunctionMultiSearchFirstPositionUTF8>();
    factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitive>();
    factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitiveUTF8>();

    factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
}
}