MultiVolnitsky added with tests and some benchmark, many multiFunctions are added to support multistring search

This commit is contained in:
Danila Kutenin 2019-01-14 18:54:47 +03:00
parent 72df7ceee6
commit a5669b6b7a
6 changed files with 24802 additions and 442 deletions

View File

@ -1,15 +1,19 @@
#pragma once #pragma once
#include <Common/StringSearcher.h> #include <vector>
#include <Common/StringUtils/StringUtils.h> #include <stdint.h>
#include <string.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Core/Types.h> #include <Core/Types.h>
#include <Poco/UTF8Encoding.h> #include <Poco/UTF8Encoding.h>
#include <Poco/Unicode.h> #include <Poco/Unicode.h>
#include <Common/StringSearcher.h>
#include <Common/StringUtils/StringUtils.h>
#include <common/StringRef.h>
#include <common/unaligned.h> #include <common/unaligned.h>
#include <ext/range.h> #include <ext/range.h>
#include <stdint.h>
#include <string.h>
/** Search for a substring in a string by Volnitsky's algorithm /** Search for a substring in a string by Volnitsky's algorithm
* http://volnitsky.com/project/str_search/ * http://volnitsky.com/project/str_search/
@ -28,117 +32,38 @@
* - if it did not match, we check the next cell of the hash table from the collision resolution chain; * - if it did not match, we check the next cell of the hash table from the collision resolution chain;
* - if not found, skip to haystack almost the size of the needle bytes; * - if not found, skip to haystack almost the size of the needle bytes;
* *
* Unaligned memory access is used. * MultiVersion:
* - Add bigrams to hash table with string index. Then the usual Volnitsky search is used.
* - We are adding while searching, limiting the number of fallback searchers and the total number of added bigrams
*/ */
namespace DB namespace DB
{ {
namespace VolnitskyTraits
/// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack
template <typename CRTP>
class VolnitskyBase
{ {
protected:
using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255. using Offset = UInt8; /// Offset in the needle. For the basic algorithm, the length of the needle must not be greater than 255.
using Id = UInt8; /// Index of the string, must not be greater than 255.
using Ngram = UInt16; /// n-gram (2 bytes). using Ngram = UInt16; /// n-gram (2 bytes).
const UInt8 * const needle; /** Fits into the L2 cache (of common Intel CPUs).
const size_t needle_size; * This number is extremely good for compilers as it is numeric_limits<Uint16>::max() and there are optimizations with movzwl and other instructions with 2 bytes
const UInt8 * const needle_end = needle + needle_size; */
/// For how long we move, if the n-gram from haystack is not found in the hash table. static constexpr size_t hash_size = 64 * 1024;
const size_t step = needle_size - sizeof(Ngram) + 1;
/** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1)
* storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
static const size_t hash_size = 64 * 1024; /// Fits into the L2 cache (of common Intel CPUs).
Offset hash[hash_size]; /// Hash table.
/// min haystack size to use main algorithm instead of fallback /// min haystack size to use main algorithm instead of fallback
static constexpr auto min_haystack_size_for_algorithm = 20000; static constexpr size_t min_haystack_size_for_algorithm = 20000;
const bool fallback; /// Do we need to use the fallback algorithm.
public: static inline bool isFallbackNeedle(const size_t needle_size, size_t haystack_size_hint = 0)
/** haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
* If you specify it small enough, the fallback algorithm will be used,
* since it is considered that it's useless to waste time initializing the hash table.
*/
VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0)
: needle{reinterpret_cast<const UInt8 *>(needle)}, needle_size{needle_size},
fallback{
needle_size < 2 * sizeof(Ngram)
|| needle_size >= std::numeric_limits<Offset>::max()
|| (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm)}
{ {
if (fallback) return needle_size < 2 * sizeof(Ngram) || needle_size >= std::numeric_limits<Offset>::max()
return; || (haystack_size_hint && haystack_size_hint < min_haystack_size_for_algorithm);
memset(hash, 0, sizeof(hash));
/// int is used here because unsigned can't be used with condition like `i >= 0`, unsigned always >= 0
for (auto i = static_cast<int>(needle_size - sizeof(Ngram)); i >= 0; --i)
self().putNGram(this->needle + i, i + 1, this->needle);
} }
static inline Ngram toNGram(const UInt8 * const pos) { return unalignedLoad<Ngram>(pos); }
/// If not found, the end of the haystack is returned. template <class Callback>
const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const static inline void putNGramASCIICaseInsensitive(const UInt8 * const pos, const int offset, const Callback & putNGramBase)
{
if (needle_size == 0)
return haystack;
const auto haystack_end = haystack + haystack_size;
if (needle_size == 1 || fallback || haystack_size <= needle_size)
return self().search_fallback(haystack, haystack_end);
/// Let's "apply" the needle to the haystack and compare the n-gram from the end of the needle.
const auto * pos = haystack + needle_size - sizeof(Ngram);
for (; pos <= haystack_end - needle_size; pos += step)
{
/// We look at all the cells of the hash table that can correspond to the n-gram from haystack.
for (size_t cell_num = toNGram(pos) % hash_size; hash[cell_num];
cell_num = (cell_num + 1) % hash_size)
{
/// When found - compare bytewise, using the offset from the hash table.
const auto res = pos - (hash[cell_num] - 1);
if (self().compare(res))
return res;
}
}
/// The remaining tail.
return self().search_fallback(pos - step + 1, haystack_end);
}
const char * search(const char * haystack, size_t haystack_size) const
{
return reinterpret_cast<const char *>(search(reinterpret_cast<const UInt8 *>(haystack), haystack_size));
}
protected:
CRTP & self() { return static_cast<CRTP &>(*this); }
const CRTP & self() const { return const_cast<VolnitskyBase *>(this)->self(); }
static Ngram toNGram(const UInt8 * const pos)
{
return unalignedLoad<Ngram>(pos);
}
void putNGramBase(const Ngram ngram, const int offset)
{
/// Put the offset for the n-gram in the corresponding cell or the nearest free cell.
size_t cell_num = ngram % hash_size;
while (hash[cell_num])
cell_num = (cell_num + 1) % hash_size; /// Search for the next free cell.
hash[cell_num] = offset;
}
void putNGramASCIICaseInsensitive(const UInt8 * const pos, const int offset)
{ {
struct Chars struct Chars
{ {
@ -186,74 +111,18 @@ protected:
/// 1 combination: 01 /// 1 combination: 01
putNGramBase(n, offset); putNGramBase(n, offset);
} }
};
template <bool CaseSensitive, bool ASCII, class Callback>
template <bool CaseSensitive, bool ASCII> struct VolnitskyImpl; static inline void
putNGram(const UInt8 * const pos, const int offset, [[maybe_unused]] const UInt8 * const begin, const Callback & putNGramBase)
/// Case sensitive comparison
template <bool ASCII> struct VolnitskyImpl<true, ASCII> : VolnitskyBase<VolnitskyImpl<true, ASCII>>
{
VolnitskyImpl(const char * const needle_, const size_t needle_size_, const size_t haystack_size_hint = 0)
: VolnitskyBase<VolnitskyImpl<true, ASCII>>{needle_, needle_size_, haystack_size_hint},
fallback_searcher{needle_, needle_size_}
{ {
} if constexpr (CaseSensitive)
putNGramBase(toNGram(pos), offset);
void putNGram(const UInt8 * const pos, const int offset, const UInt8 * const /*begin*/) else
{ {
this->putNGramBase(this->toNGram(pos), offset); if constexpr (ASCII)
} putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
else
bool compare(const UInt8 * const pos) const
{
/// @todo: maybe just use memcmp for this case and rely on internal SSE optimization as in case with memcpy?
return fallback_searcher.compare(pos);
}
const UInt8 * search_fallback(const UInt8 * const haystack, const UInt8 * const haystack_end) const
{
return fallback_searcher.search(haystack, haystack_end);
}
ASCIICaseSensitiveStringSearcher fallback_searcher;
};
/// Case-insensitive ASCII
template <> struct VolnitskyImpl<false, true> : VolnitskyBase<VolnitskyImpl<false, true>>
{
VolnitskyImpl(const char * const needle_, const size_t needle_size_, const size_t haystack_size_hint = 0)
: VolnitskyBase{needle_, needle_size_, haystack_size_hint}, fallback_searcher{needle_, needle_size_}
{
}
void putNGram(const UInt8 * const pos, const int offset, const UInt8 * const /*begin*/)
{
putNGramASCIICaseInsensitive(pos, offset);
}
bool compare(const UInt8 * const pos) const
{
return fallback_searcher.compare(pos);
}
const UInt8 * search_fallback(const UInt8 * const haystack, const UInt8 * const haystack_end) const
{
return fallback_searcher.search(haystack, haystack_end);
}
ASCIICaseInsensitiveStringSearcher fallback_searcher;
};
/// Case-sensitive UTF-8
template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<false, false>>
{
VolnitskyImpl(const char * const needle_, const size_t needle_size_, const size_t haystack_size_hint = 0)
: VolnitskyBase{needle_, needle_size_, haystack_size_hint}, fallback_searcher{needle_, needle_size_}
{
}
void putNGram(const UInt8 * const pos, const int offset, const UInt8 * const begin)
{ {
struct Chars struct Chars
{ {
@ -263,16 +132,14 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
union union
{ {
Ngram n; VolnitskyTraits::Ngram n;
Chars chars; Chars chars;
}; };
n = toNGram(pos); n = toNGram(pos);
if (isascii(chars.c0) && isascii(chars.c1)) if (isascii(chars.c0) && isascii(chars.c1))
{ putNGramASCIICaseInsensitive(pos, offset, putNGramBase);
putNGramASCIICaseInsensitive(pos, offset);
}
else else
{ {
/** n-gram (in the case of n = 2) /** n-gram (in the case of n = 2)
@ -435,25 +302,363 @@ template <> struct VolnitskyImpl<false, false> : VolnitskyBase<VolnitskyImpl<fal
} }
} }
} }
}
bool compare(const UInt8 * const pos) const
{
return fallback_searcher.compare(pos);
} }
const UInt8 * search_fallback(const UInt8 * const haystack, const UInt8 * const haystack_end) const } // namespace VolnitskyTraits
template <bool CaseSensitive, bool ASCII, class FallbackSearcher>
class MultiVolnitskyBase
{
private:
/// needles and their offsets
const std::vector<String> & needles;
/// fallback searchers
std::vector<size_t> fallback_needles;
std::vector<FallbackSearcher> fallback_searchers;
/// because std::pair<> is not POD
struct OffsetId
{ {
return fallback_searcher.search(haystack, haystack_end); VolnitskyTraits::Id id;
VolnitskyTraits::Offset off;
};
OffsetId hash[VolnitskyTraits::hash_size];
/// step for each bunch of strings
size_t step;
/// last index of offsets that was not processed
size_t last;
/// Limit on the number of bigrams added to the hash table per batch. In the worst case, with case-insensitive search, the table will be at most half full.
static constexpr size_t small_limit = VolnitskyTraits::hash_size / 8;
public:
/// Binds the searcher to `needles_` (held by reference, not copied) and starts with no needle processed yet.
MultiVolnitskyBase(const std::vector<String> & needles_)
    : needles{needles_}
    , step{0}
    , last{0}
{
    /// One fallback searcher is created per needle later on, so make room for all of them up front.
    const size_t needle_count = needles.size();
    fallback_searchers.reserve(needle_count);
}
UTF8CaseInsensitiveStringSearcher fallback_searcher; /// returns vector of the positions
/// For every haystack row and every needle, records a pointer to a match of that needle inside the row
/// (nullptr when no match was recorded).
/// The result is laid out row by row: entry [j * needles.size() + i] belongs to haystack row j and needle i.
/// Needles are handled in batches prepared by reset(); every batch makes one full pass over all rows.
std::vector<const char *> search_all(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets)
{
const size_t haystack_string_size = haystack_offsets.size();
const size_t needles_size = needles.size();
std::vector<const char *> ans(needles_size * haystack_string_size, nullptr);
/// reset() returns true once every needle has been processed by some batch.
while (!reset())
{
size_t fallback_size = fallback_needles.size();
size_t prev_offset = 0;
/// `from` is the index of the first result cell of row j.
for (size_t j = 0, from = 0; j < haystack_string_size; ++j, from += needles_size)
{
const auto * haystack = &haystack_data[prev_offset];
/// The last byte of the row is excluded (presumably the terminating zero byte - confirm against ColumnString).
const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
/// Needles of this batch that are not in the hash table are searched one by one.
for (size_t i = 0; i < fallback_size; ++i)
{
const UInt8 * ptr = fallback_searchers[fallback_needles[i]].search(haystack, haystack_end);
/// The searcher returns haystack_end when nothing is found.
if (ptr != haystack_end)
ans[from + fallback_needles[i]] = reinterpret_cast<const char *>(ptr);
}
/// check if we have one non empty volnitsky searcher
if (step != std::numeric_limits<size_t>::max())
{
/// Probe one bigram every `step` bytes; `step` is the smallest bigram count among the indexed needles of the batch.
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
/// Walk the collision chain of the probed bigram until an empty cell (off == 0) is reached.
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
/// The candidate start must not lie before the beginning of the row.
if (pos >= haystack + hash[cell_num].off - 1)
{
/// `off` is the 1-based position of the bigram inside the needle, so this is where the needle would start.
const auto res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
/// Only the first recorded match per (row, needle) is kept, and the needle must fit before the end of the row.
if (!ans[from + ind] && res + needles[ind].size() <= haystack_end)
{
if (fallback_searchers[ind].compare(res))
{
ans[from + ind] = reinterpret_cast<const char *>(res);
}
}
}
}
}
}
prev_offset = haystack_offsets[j];
}
}
return ans;
}
/// For every haystack row, reports whether at least one of the needles occurs in it (1) or not (0).
/// Needles are indexed in batches by reset(), and every batch makes a full pass over the rows.
/// A hit recorded by an earlier batch is therefore final: it must not be overwritten by a later
/// batch that happens to find nothing in the same row.
std::vector<char> search(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets)
{
    const size_t haystack_string_size = haystack_offsets.size();
    std::vector<char> ans(haystack_string_size, 0);

    /// reset() returns true once every needle has been processed by some batch.
    while (!reset())
    {
        size_t prev_offset = 0;
        for (size_t j = 0; j < haystack_string_size; ++j)
        {
            /// Rows already matched by a previous batch keep their result; scanning them again is pointless.
            if (!ans[j])
            {
                const auto * haystack = &haystack_data[prev_offset];
                /// The last byte of the row is excluded (presumably the terminating zero byte - confirm against ColumnString).
                const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
                ans[j] = search_one(haystack, haystack_end);
            }
            prev_offset = haystack_offsets[j];
        }
    }

    return ans;
}
/// For every haystack row, returns the 1-based index of the lowest-numbered needle that occurs in it,
/// or 0 when no needle occurs.
/// Needles are indexed in batches by reset(), in increasing index order, and every batch makes a full
/// pass over the rows. The first batch that reports a match for a row therefore already holds the
/// smallest index; later batches must neither replace it with a larger index nor reset it to 0.
std::vector<size_t> search_index(const ColumnString::Chars & haystack_data, const ColumnString::Offsets & haystack_offsets)
{
    const size_t haystack_string_size = haystack_offsets.size();
    std::vector<size_t> ans(haystack_string_size, 0);

    /// reset() returns true once every needle has been processed by some batch.
    while (!reset())
    {
        size_t prev_offset = 0;
        for (size_t j = 0; j < haystack_string_size; ++j)
        {
            /// A non-zero value comes from an earlier batch, i.e. from a needle with a smaller index: keep it.
            if (ans[j] == 0)
            {
                const auto * haystack = &haystack_data[prev_offset];
                /// The last byte of the row is excluded (presumably the terminating zero byte - confirm against ColumnString).
                const auto * haystack_end = haystack + haystack_offsets[j] - prev_offset - 1;
                ans[j] = search_one_index(haystack, haystack_end);
            }
            prev_offset = haystack_offsets[j];
        }
    }

    return ans;
}
private:
bool reset()
{
if (last == needles.size())
return true;
memset(hash, 0, sizeof(hash));
fallback_needles.clear();
step = std::numeric_limits<size_t>::max();
size_t buf = 0;
size_t size = needles.size();
for (; last < size; ++last)
{
const char * cur_needle_data = needles[last].data();
const size_t cur_needle_size = needles[last].size();
/// save the indices of fallback searchers
if (VolnitskyTraits::isFallbackNeedle(cur_needle_size))
{
fallback_needles.push_back(last);
}
else
{
/// put all bigrams
auto callback = [this](const VolnitskyTraits::Ngram ngram, const int offset) {
return this->putNGramBase(ngram, offset, this->last);
};
buf += cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
if (buf > small_limit)
{
break;
}
step = std::min(step, cur_needle_size - sizeof(VolnitskyTraits::Ngram) + 1);
for (auto i = static_cast<int>(cur_needle_size - sizeof(VolnitskyTraits::Ngram)); i >= 0; --i)
{
VolnitskyTraits::putNGram<CaseSensitive, ASCII>(
reinterpret_cast<const UInt8 *>(cur_needle_data) + i,
i + 1,
reinterpret_cast<const UInt8 *>(cur_needle_data),
callback);
}
}
fallback_searchers.emplace_back(cur_needle_data, cur_needle_size);
}
return false;
}
inline bool search_one(const UInt8 * haystack, const UInt8 * haystack_end)
{
const size_t fallback_size = fallback_needles.size();
for (size_t i = 0; i < fallback_size; ++i)
if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end)
return true;
/// check if we have one non empty volnitsky searcher
if (step != std::numeric_limits<size_t>::max())
{
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
if (pos >= haystack + hash[cell_num].off - 1)
{
const auto res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
if (res + needles[ind].size() <= haystack_end && fallback_searchers[ind].compare(res))
return true;
}
}
}
}
return false;
}
inline size_t search_one_index(const UInt8 * haystack, const UInt8 * haystack_end)
{
const size_t fallback_size = fallback_needles.size();
size_t ans = std::numeric_limits<size_t>::max();
for (size_t i = 0; i < fallback_size; ++i)
if (fallback_searchers[fallback_needles[i]].search(haystack, haystack_end) != haystack_end)
ans = std::min(ans, fallback_needles[i]);
/// check if we have one non empty volnitsky searcher
if (step != std::numeric_limits<size_t>::max())
{
const auto * pos = haystack + step - sizeof(VolnitskyTraits::Ngram);
for (; pos <= haystack_end - sizeof(VolnitskyTraits::Ngram); pos += step)
{
for (size_t cell_num = VolnitskyTraits::toNGram(pos) % VolnitskyTraits::hash_size; hash[cell_num].off;
cell_num = (cell_num + 1) % VolnitskyTraits::hash_size)
{
if (pos >= haystack + hash[cell_num].off - 1)
{
const auto res = pos - (hash[cell_num].off - 1);
const size_t ind = hash[cell_num].id;
if (res + needles[ind].size() <= haystack_end && fallback_searchers[ind].compare(res))
ans = std::min(ans, ind);
}
}
}
}
/*
* NOTE!!! if nothing was found, ans + 1 will be equal to zero and we can
* std::copy it into the result because we need to return the position starting with one
*/
return ans + 1;
}
/// Records that `ngram` occurs at 1-based position `offset` of the needle with index `num`.
/// Starting from the bigram's own cell, the first cell whose offset is still zero is taken.
void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset, const size_t num)
{
    size_t slot = ngram % VolnitskyTraits::hash_size;
    for (; hash[slot].off != 0; slot = (slot + 1) % VolnitskyTraits::hash_size)
    {
        /// Occupied cell: keep probing.
    }

    /// Both values are narrowed to the one-byte types of the table.
    OffsetId & cell = hash[slot];
    cell.id = static_cast<VolnitskyTraits::Id>(num);
    cell.off = static_cast<VolnitskyTraits::Offset>(offset);
}
}; };
/// @todo store lowercase needle to speed up in case there are numerous occurrences of bigrams from needle in haystack
template <bool CaseSensitive, bool ASCII, class FallbackSearcher>
class VolnitskyBase
{
protected:
const UInt8 * const needle;
const size_t needle_size;
const UInt8 * const needle_end = needle + needle_size;
/// For how long we move, if the n-gram from haystack is not found in the hash table.
const size_t step = needle_size - sizeof(VolnitskyTraits::Ngram) + 1;
using Volnitsky = VolnitskyImpl<true, true>; /** max needle length is 255, max distinct ngrams for case-sensitive is (255 - 1), case-insensitive is 4 * (255 - 1)
using VolnitskyUTF8 = VolnitskyImpl<true, false>; /// exactly same as Volnitsky * storage of 64K ngrams (n = 2, 128 KB) should be large enough for both cases */
using VolnitskyCaseInsensitive = VolnitskyImpl<false, true>; /// ignores non-ASCII bytes VolnitskyTraits::Offset hash[VolnitskyTraits::hash_size]; /// Hash table.
using VolnitskyCaseInsensitiveUTF8 = VolnitskyImpl<false, false>;
const bool fallback; /// Do we need to use the fallback algorithm.
FallbackSearcher fallback_searcher;
public:
/** Prepares a searcher for the given needle.
  * haystack_size_hint - the expected total size of the haystack for `search` calls. Optional (zero means unspecified).
  * When the hint is small enough, the fallback algorithm is used instead,
  * because filling the hash table would not pay off.
  */
VolnitskyBase(const char * const needle, const size_t needle_size, size_t haystack_size_hint = 0)
    : needle{reinterpret_cast<const UInt8 *>(needle)}
    , needle_size{needle_size}
    , fallback{VolnitskyTraits::isFallbackNeedle(needle_size, haystack_size_hint)}
    , fallback_searcher{needle, needle_size}
{
    /// The hash table is filled only when the main algorithm is going to be used.
    if (!fallback)
    {
        memset(hash, 0, sizeof(hash));

        const auto store_bigram = [this](const VolnitskyTraits::Ngram ngram, const int offset) { return this->putNGramBase(ngram, offset); };

        /// Bigrams are added from the end of the needle towards its beginning, with 1-based offsets.
        /// The counter is signed because the loop has to stop once it drops below zero.
        int i = static_cast<int>(needle_size - sizeof(VolnitskyTraits::Ngram));
        while (i >= 0)
        {
            VolnitskyTraits::putNGram<CaseSensitive, ASCII>(this->needle + i, i + 1, this->needle, store_bigram);
            --i;
        }
    }
}
/// Looks for the needle in `haystack_size` bytes starting at `haystack`.
/// Returns a pointer to the match; if not found, the end of the haystack is returned.
const UInt8 * search(const UInt8 * const haystack, const size_t haystack_size) const
{
    /// An empty needle is reported at the very beginning of the haystack.
    if (needle_size == 0)
        return haystack;

    const UInt8 * const haystack_end = haystack + haystack_size;

    /// The hash table is usable only if it was built and the haystack is longer than the needle.
    const bool use_hash_table = !fallback && haystack_size > needle_size;
    if (!use_hash_table)
        return fallback_searcher.search(haystack, haystack_end);

    /// "Apply" the needle to the haystack and look at the bigram under the end of the needle.
    const UInt8 * probe = haystack + needle_size - sizeof(VolnitskyTraits::Ngram);
    while (probe <= haystack_end - needle_size)
    {
        /// Visit every cell of the hash table that may correspond to the bigram from the haystack.
        size_t cell = VolnitskyTraits::toNGram(probe) % VolnitskyTraits::hash_size;
        while (hash[cell])
        {
            /// The stored offset is 1-based; it gives the place where the needle would have to start.
            const UInt8 * const candidate = probe - (hash[cell] - 1);
            /// NOTE(review): the original comment states the pointer always belongs to a padded array,
            /// which is why the comparison needs no extra bounds check here - confirm against callers.
            if (fallback_searcher.compare(candidate))
                return candidate;
            cell = (cell + 1) % VolnitskyTraits::hash_size;
        }
        probe += step;
    }

    /// The remaining tail is handled by the fallback searcher.
    return fallback_searcher.search(probe - step + 1, haystack_end);
}
const char * search(const char * haystack, size_t haystack_size) const
{
return reinterpret_cast<const char *>(search(reinterpret_cast<const UInt8 *>(haystack), haystack_size));
}
protected:
/// Stores the 1-based `offset` of `ngram` in the hash table.
void putNGramBase(const VolnitskyTraits::Ngram ngram, const int offset)
{
    /// Start at the bigram's own cell and move forward (wrapping around) to the nearest free one.
    size_t slot = ngram % VolnitskyTraits::hash_size;
    for (; hash[slot] != 0; slot = (slot + 1) % VolnitskyTraits::hash_size)
    {
        /// Occupied cell: keep probing.
    }

    hash[slot] = offset;
}
};
using Volnitsky = VolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
using VolnitskyUTF8 = VolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>; /// exactly same as Volnitsky
using VolnitskyCaseInsensitive = VolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>; /// ignores non-ASCII bytes
using VolnitskyCaseInsensitiveUTF8 = VolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
using MultiVolnitsky = MultiVolnitskyBase<true, true, ASCIICaseSensitiveStringSearcher>;
using MultiVolnitskyUTF8 = MultiVolnitskyBase<true, false, ASCIICaseSensitiveStringSearcher>;
using MultiVolnitskyCaseInsensitive = MultiVolnitskyBase<false, true, ASCIICaseInsensitiveStringSearcher>;
using MultiVolnitskyCaseInsensitiveUTF8 = MultiVolnitskyBase<false, false, UTF8CaseInsensitiveStringSearcher>;
} }

View File

@ -1,28 +1,28 @@
#include <Functions/FunctionsStringSearch.h> #include <Functions/FunctionsStringSearch.h>
#include <memory>
#include <mutex>
#include <Poco/UTF8String.h>
#include <Columns/ColumnFixedString.h> #include <Columns/ColumnFixedString.h>
#include <Common/Volnitsky.h> #include <Common/config.h>
#include <DataTypes/DataTypeFixedString.h> #include <DataTypes/DataTypeFixedString.h>
#include <Functions/FunctionFactory.h> #include <Functions/FunctionFactory.h>
#include <Functions/Regexps.h> #include <Functions/Regexps.h>
#include <IO/WriteHelpers.h> #include <IO/WriteHelpers.h>
#include <Common/config.h>
#include <re2/re2.h> #include <re2/re2.h>
#include <re2/stringpiece.h> #include <re2/stringpiece.h>
#include <Poco/UTF8String.h>
#include <Common/Volnitsky.h>
#include <algorithm>
#include <memory>
#if USE_RE2_ST #if USE_RE2_ST
#include <re2_st/re2.h> // Y_IGNORE # include <re2_st/re2.h> // Y_IGNORE
#else #else
#define re2_st re2 # define re2_st re2
#endif #endif
namespace DB namespace DB
{ {
namespace ErrorCodes namespace ErrorCodes
{ {
extern const int BAD_ARGUMENTS; extern const int BAD_ARGUMENTS;
@ -35,7 +35,10 @@ namespace ErrorCodes
struct PositionCaseSensitiveASCII struct PositionCaseSensitiveASCII
{ {
/// For searching single substring inside big-enough contiguous chunk of data. Coluld have slightly expensive initialization. /// For searching single substring inside big-enough contiguous chunk of data. Coluld have slightly expensive initialization.
using SearcherInBigHaystack = VolnitskyImpl<true, true>; using SearcherInBigHaystack = Volnitsky;
/// For search many substrings in one string
using MultiSearcherInBigHaystack = MultiVolnitsky;
/// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization. /// For searching single substring, that is different each time. This object is created for each row of data. It must have cheap initialization.
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher; using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
@ -50,23 +53,24 @@ struct PositionCaseSensitiveASCII
return SearcherInSmallHaystack(needle_data, needle_size); return SearcherInSmallHaystack(needle_data, needle_size);
} }
/// Number of code points between 'begin' and 'end' (this has different behaviour for ASCII and UTF-8). static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<String> & needles)
static size_t countChars(const char * begin, const char * end)
{ {
return end - begin; return MultiSearcherInBigHaystack(needles);
} }
/// Number of code points between 'begin' and 'end' (this has different behaviour for ASCII and UTF-8).
static size_t countChars(const char * begin, const char * end) { return end - begin; }
/// Convert string to lowercase. Only for case-insensitive search. /// Convert string to lowercase. Only for case-insensitive search.
/// Implementation is permitted to be inefficient because it is called for single string. /// Implementation is permitted to be inefficient because it is called for single string.
static void toLowerIfNeed(std::string &) static void toLowerIfNeed(std::string &) {}
{
}
}; };
struct PositionCaseInsensitiveASCII struct PositionCaseInsensitiveASCII
{ {
/// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it. /// `Volnitsky` is not used here, because one person has measured that this is better. It will be good if you question it.
using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher; using SearcherInBigHaystack = ASCIICaseInsensitiveStringSearcher;
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitive;
using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher; using SearcherInSmallHaystack = LibCASCIICaseInsensitiveStringSearcher;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/) static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t /*haystack_size_hint*/)
@ -79,20 +83,20 @@ struct PositionCaseInsensitiveASCII
return SearcherInSmallHaystack(needle_data, needle_size); return SearcherInSmallHaystack(needle_data, needle_size);
} }
static size_t countChars(const char * begin, const char * end) static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<String> & needles)
{ {
return end - begin; return MultiSearcherInBigHaystack(needles);
} }
static void toLowerIfNeed(std::string & s) static size_t countChars(const char * begin, const char * end) { return end - begin; }
{
std::transform(std::begin(s), std::end(s), std::begin(s), tolower); static void toLowerIfNeed(std::string & s) { std::transform(std::begin(s), std::end(s), std::begin(s), tolower); }
}
}; };
struct PositionCaseSensitiveUTF8 struct PositionCaseSensitiveUTF8
{ {
using SearcherInBigHaystack = VolnitskyImpl<true, false>; using SearcherInBigHaystack = VolnitskyUTF8;
using MultiSearcherInBigHaystack = MultiVolnitskyUTF8;
using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher; using SearcherInSmallHaystack = LibCASCIICaseSensitiveStringSearcher;
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint) static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
@ -105,6 +109,11 @@ struct PositionCaseSensitiveUTF8
return SearcherInSmallHaystack(needle_data, needle_size); return SearcherInSmallHaystack(needle_data, needle_size);
} }
/// Creates the searcher that looks for many needles at once.
/// The searcher keeps a reference to `needles` rather than a copy, so the vector must outlive the returned object.
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<String> & needles)
{
return MultiSearcherInBigHaystack(needles);
}
static size_t countChars(const char * begin, const char * end) static size_t countChars(const char * begin, const char * end)
{ {
size_t res = 0; size_t res = 0;
@ -114,14 +123,13 @@ struct PositionCaseSensitiveUTF8
return res; return res;
} }
static void toLowerIfNeed(std::string &) static void toLowerIfNeed(std::string &) {}
{
}
}; };
struct PositionCaseInsensitiveUTF8 struct PositionCaseInsensitiveUTF8
{ {
using SearcherInBigHaystack = VolnitskyImpl<false, false>; using SearcherInBigHaystack = VolnitskyCaseInsensitiveUTF8;
using MultiSearcherInBigHaystack = MultiVolnitskyCaseInsensitiveUTF8;
using SearcherInSmallHaystack = UTF8CaseInsensitiveStringSearcher; /// TODO Very suboptimal. using SearcherInSmallHaystack = UTF8CaseInsensitiveStringSearcher; /// TODO Very suboptimal.
static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint) static SearcherInBigHaystack createSearcherInBigHaystack(const char * needle_data, size_t needle_size, size_t haystack_size_hint)
@ -134,6 +142,11 @@ struct PositionCaseInsensitiveUTF8
return SearcherInSmallHaystack(needle_data, needle_size); return SearcherInSmallHaystack(needle_data, needle_size);
} }
/// Creates the searcher that looks for many needles at once.
/// The searcher keeps a reference to `needles` rather than a copy, so the vector must outlive the returned object.
static MultiSearcherInBigHaystack createMultiSearcherInBigHaystack(const std::vector<String> & needles)
{
return MultiSearcherInBigHaystack(needles);
}
static size_t countChars(const char * begin, const char * end) static size_t countChars(const char * begin, const char * end)
{ {
size_t res = 0; size_t res = 0;
@ -143,10 +156,7 @@ struct PositionCaseInsensitiveUTF8
return res; return res;
} }
static void toLowerIfNeed(std::string & s) static void toLowerIfNeed(std::string & s) { Poco::UTF8::toLowerInPlace(s); }
{
Poco::UTF8::toLowerInPlace(s);
}
}; };
template <typename Impl> template <typename Impl>
@ -155,10 +165,8 @@ struct PositionImpl
using ResultType = UInt64; using ResultType = UInt64;
/// Find one substring in many strings. /// Find one substring in many strings.
static void vector_constant(const ColumnString::Chars & data, static void vector_constant(
const ColumnString::Offsets & offsets, const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & needle, PaddedPODArray<UInt64> & res)
const std::string & needle,
PaddedPODArray<UInt64> & res)
{ {
const UInt8 * begin = data.data(); const UInt8 * begin = data.data();
const UInt8 * pos = begin; const UInt8 * pos = begin;
@ -210,7 +218,8 @@ struct PositionImpl
} }
/// Search each time for a different single substring inside each time different string. /// Search each time for a different single substring inside each time different string.
static void vector_vector(const ColumnString::Chars & haystack_data, static void vector_vector(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets, const ColumnString::Offsets & haystack_offsets,
const ColumnString::Chars & needle_data, const ColumnString::Chars & needle_data,
const ColumnString::Offsets & needle_offsets, const ColumnString::Offsets & needle_offsets,
@ -234,8 +243,8 @@ struct PositionImpl
else else
{ {
/// It is assumed that the StringSearcher is not very difficult to initialize. /// It is assumed that the StringSearcher is not very difficult to initialize.
typename Impl::SearcherInSmallHaystack searcher typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
= Impl::createSearcherInSmallHaystack(reinterpret_cast<const char *>(&needle_data[prev_needle_offset]), reinterpret_cast<const char *>(&needle_data[prev_needle_offset]),
needle_offsets[i] - prev_needle_offset - 1); /// zero byte at the end needle_offsets[i] - prev_needle_offset - 1); /// zero byte at the end
/// searcher returns a pointer to the found substring or to the end of `haystack`. /// searcher returns a pointer to the found substring or to the end of `haystack`.
@ -244,7 +253,9 @@ struct PositionImpl
if (pos != haystack_size) if (pos != haystack_size)
{ {
res[i] = 1 + Impl::countChars(reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]), res[i] = 1
+ Impl::countChars(
reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset]),
reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset + pos])); reinterpret_cast<const char *>(&haystack_data[prev_haystack_offset + pos]));
} }
else else
@ -257,7 +268,8 @@ struct PositionImpl
} }
/// Find many substrings in one line. /// Find many substrings in one line.
static void constant_vector(const String & haystack, static void constant_vector(
const String & haystack,
const ColumnString::Chars & needle_data, const ColumnString::Chars & needle_data,
const ColumnString::Offsets & needle_offsets, const ColumnString::Offsets & needle_offsets,
PaddedPODArray<UInt64> & res) PaddedPODArray<UInt64> & res)
@ -281,7 +293,8 @@ struct PositionImpl
typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack( typename Impl::SearcherInSmallHaystack searcher = Impl::createSearcherInSmallHaystack(
reinterpret_cast<const char *>(&needle_data[prev_needle_offset]), needle_offsets[i] - prev_needle_offset - 1); reinterpret_cast<const char *>(&needle_data[prev_needle_offset]), needle_offsets[i] - prev_needle_offset - 1);
size_t pos = searcher.search(reinterpret_cast<const UInt8 *>(haystack.data()), size_t pos = searcher.search(
reinterpret_cast<const UInt8 *>(haystack.data()),
reinterpret_cast<const UInt8 *>(haystack.data()) + haystack.size()) reinterpret_cast<const UInt8 *>(haystack.data()) + haystack.size())
- reinterpret_cast<const UInt8 *>(haystack.data()); - reinterpret_cast<const UInt8 *>(haystack.data());
@ -298,6 +311,71 @@ struct PositionImpl
} }
}; };
/** Adapter used by the multiPosition* functions.
  * For every haystack row and every needle it writes the 1-based position of the needle
  * inside that row (as counted by Impl::countChars), or 0 when the needle was not found.
  */
template <typename Impl>
struct MultiPositionImpl
{
    using ResultType = UInt64;

    /** haystack_data / haystack_offsets - the haystack string column;
      * needles - the constant list of substrings to look for;
      * res - flat output, indexed as [row * needles.size() + needle]; it is only written
      *  through operator[], so the caller must have sized it beforehand.
      */
    static void multi_constant_vector(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<String> & needles,
        PaddedPODArray<UInt64> & res)
    {
        /// One entry per (row, needle) pair, in the same flat order as `res`; a null entry means "not found".
        const auto found = Impl::createMultiSearcherInBigHaystack(needles).search_all(haystack_data, haystack_offsets);

        const size_t rows = haystack_offsets.size();
        const size_t needles_per_row = needles.size();

        size_t flat = 0;
        /// Offset in haystack_data where the current row starts (the end offset of the previous row).
        size_t row_begin = 0;

        for (size_t row = 0; row < rows; ++row)
        {
            for (size_t needle = 0; needle < needles_per_row; ++needle, ++flat)
            {
                const char * match = found[flat];
                if (!match)
                {
                    res[flat] = 0;
                    continue;
                }

                const char * row_start = reinterpret_cast<const char *>(&haystack_data[row_begin]);
                res[flat] = 1 + Impl::countChars(row_start, match);
            }

            row_begin = haystack_offsets[row];
        }
    }
};
/** Adapter used by the multiSearch* functions (documented in the header as
  * "find any of the const patterns inside haystack and return 0 or 1").
  */
template <typename Impl>
struct MultiSearchImpl
{
    using ResultType = UInt64;

    /** Runs the multi-needle searcher over the whole haystack column and stores its
      * results into `res`, one value per element the searcher returns.
      * `res` is written through operator[] only, so it must already be large enough.
      */
    static void multi_constant_vector(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<String> & needles,
        PaddedPODArray<UInt64> & res)
    {
        const auto per_row = Impl::createMultiSearcherInBigHaystack(needles).search(haystack_data, haystack_offsets);

        size_t row = 0;
        for (auto it = per_row.begin(); it != per_row.end(); ++it, ++row)
            res[row] = *it;
    }
};
/** Adapter used by the firstMatch* functions (documented in the header as
  * "returns the first index of the matched string or zero if nothing was found").
  */
template <typename Impl>
struct FirstMatchImpl
{
    using ResultType = UInt64;

    /** Runs the multi-needle searcher in "index" mode over the whole haystack column and
      * stores its results into `res`, one value per element the searcher returns.
      * `res` is written through operator[] only, so it must already be large enough.
      */
    static void multi_constant_vector(
        const ColumnString::Chars & haystack_data,
        const ColumnString::Offsets & haystack_offsets,
        const std::vector<String> & needles,
        PaddedPODArray<UInt64> & res)
    {
        const auto per_row = Impl::createMultiSearcherInBigHaystack(needles).search_index(haystack_data, haystack_offsets);

        size_t row = 0;
        for (auto it = per_row.begin(); it != per_row.end(); ++it, ++row)
            res[row] = *it;
    }
};
/// Is the LIKE expression reduced to finding a substring in a string? /// Is the LIKE expression reduced to finding a substring in a string?
inline bool likePatternIsStrstr(const String & pattern, String & res) inline bool likePatternIsStrstr(const String & pattern, String & res)
@ -348,10 +426,8 @@ struct MatchImpl
{ {
using ResultType = UInt8; using ResultType = UInt8;
static void vector_constant(const ColumnString::Chars & data, static void vector_constant(
const ColumnString::Offsets & offsets, const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray<UInt8> & res)
const std::string & pattern,
PaddedPODArray<UInt8> & res)
{ {
if (offsets.empty()) if (offsets.empty())
return; return;
@ -473,7 +549,8 @@ struct MatchImpl
if (required_substring_is_prefix) if (required_substring_is_prefix)
res[i] = revert res[i] = revert
^ regexp->getRE2()->Match(re2_st::StringPiece(str_data, str_size), ^ regexp->getRE2()->Match(
re2_st::StringPiece(str_data, str_size),
reinterpret_cast<const char *>(pos) - str_data, reinterpret_cast<const char *>(pos) - str_data,
str_size, str_size,
re2_st::RE2::UNANCHORED, re2_st::RE2::UNANCHORED,
@ -504,13 +581,15 @@ struct MatchImpl
res = revert ^ regexp->match(data); res = revert ^ regexp->match(data);
} }
template <typename... Args> static void vector_vector(Args &&...) template <typename... Args>
static void vector_vector(Args &&...)
{ {
throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN); throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
} }
/// Search different needles in single haystack. /// Search different needles in single haystack.
template <typename... Args> static void constant_vector(Args &&...) template <typename... Args>
static void constant_vector(Args &&...)
{ {
throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN); throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
} }
@ -519,7 +598,8 @@ struct MatchImpl
struct ExtractImpl struct ExtractImpl
{ {
static void vector(const ColumnString::Chars & data, static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets, const ColumnString::Offsets & offsets,
const std::string & pattern, const std::string & pattern,
ColumnString::Chars & res_data, ColumnString::Chars & res_data,
@ -613,16 +693,17 @@ struct ReplaceRegexpImpl
for (const auto & it : instructions) for (const auto & it : instructions)
if (it.first >= num_captures) if (it.first >= num_captures)
throw Exception("Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only " throw Exception(
+ toString(num_captures - 1) "Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only "
+ " subpatterns", + toString(num_captures - 1) + " subpatterns",
ErrorCodes::BAD_ARGUMENTS); ErrorCodes::BAD_ARGUMENTS);
return instructions; return instructions;
} }
static void processString(const re2_st::StringPiece & input, static void processString(
const re2_st::StringPiece & input,
ColumnString::Chars & res_data, ColumnString::Chars & res_data,
ColumnString::Offset & res_offset, ColumnString::Offset & res_offset,
re2_st::RE2 & searcher, re2_st::RE2 & searcher,
@ -687,7 +768,8 @@ struct ReplaceRegexpImpl
} }
static void vector(const ColumnString::Chars & data, static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets, const ColumnString::Offsets & offsets,
const std::string & needle, const std::string & needle,
const std::string & replacement, const std::string & replacement,
@ -715,7 +797,8 @@ struct ReplaceRegexpImpl
} }
} }
static void vector_fixed(const ColumnString::Chars & data, static void vector_fixed(
const ColumnString::Chars & data,
size_t n, size_t n,
const std::string & needle, const std::string & needle,
const std::string & replacement, const std::string & replacement,
@ -749,7 +832,8 @@ struct ReplaceRegexpImpl
template <bool replace_one = false> template <bool replace_one = false>
struct ReplaceStringImpl struct ReplaceStringImpl
{ {
static void vector(const ColumnString::Chars & data, static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets, const ColumnString::Offsets & offsets,
const std::string & needle, const std::string & needle,
const std::string & replacement, const std::string & replacement,
@ -824,7 +908,8 @@ struct ReplaceStringImpl
/// Note: this function converts fixed-length strings to variable-length strings /// Note: this function converts fixed-length strings to variable-length strings
/// and each variable-length string should ends with zero byte. /// and each variable-length string should ends with zero byte.
static void vector_fixed(const ColumnString::Chars & data, static void vector_fixed(
const ColumnString::Chars & data,
size_t n, size_t n,
const std::string & needle, const std::string & needle,
const std::string & replacement, const std::string & replacement,
@ -851,7 +936,8 @@ struct ReplaceStringImpl
const UInt8 * match = searcher.search(pos, end - pos); const UInt8 * match = searcher.search(pos, end - pos);
#define COPY_REST_OF_CURRENT_STRING() \ #define COPY_REST_OF_CURRENT_STRING() \
do { \ do \
{ \
const size_t len = begin + n * (i + 1) - pos; \ const size_t len = begin + n * (i + 1) - pos; \
res_data.resize(res_data.size() + len + 1); \ res_data.resize(res_data.size() + len + 1); \
memcpy(&res_data[res_offset], pos, len); \ memcpy(&res_data[res_offset], pos, len); \
@ -935,20 +1021,11 @@ class FunctionStringReplace : public IFunction
{ {
public: public:
static constexpr auto name = Name::name; static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) static FunctionPtr create(const Context &) { return std::make_shared<FunctionStringReplace>(); }
{
return std::make_shared<FunctionStringReplace>();
}
String getName() const override String getName() const override { return name; }
{
return name;
}
size_t getNumberOfArguments() const override size_t getNumberOfArguments() const override { return 3; }
{
return 3;
}
bool useDefaultImplementationForConstants() const override { return true; } bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; } ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
@ -956,15 +1033,18 @@ public:
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{ {
if (!isStringOrFixedString(arguments[0])) if (!isStringOrFixedString(arguments[0]))
throw Exception("Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(), throw Exception(
"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isStringOrFixedString(arguments[1])) if (!isStringOrFixedString(arguments[1]))
throw Exception("Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(), throw Exception(
"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isStringOrFixedString(arguments[2])) if (!isStringOrFixedString(arguments[2]))
throw Exception("Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(), throw Exception(
"Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT); ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>(); return std::make_shared<DataTypeString>();
@ -1025,6 +1105,54 @@ struct NamePositionCaseInsensitiveUTF8
{ {
static constexpr auto name = "positionCaseInsensitiveUTF8"; static constexpr auto name = "positionCaseInsensitiveUTF8";
}; };
/// SQL-visible names of the multiPosition family (one per case-sensitivity / encoding variant).
struct NameMultiPosition
{
    static constexpr const char * name = "multiPosition";
};
struct NameMultiPositionUTF8
{
    static constexpr const char * name = "multiPositionUTF8";
};
struct NameMultiPositionCaseInsensitive
{
    static constexpr const char * name = "multiPositionCaseInsensitive";
};
struct NameMultiPositionCaseInsensitiveUTF8
{
    static constexpr const char * name = "multiPositionCaseInsensitiveUTF8";
};

/// SQL-visible names of the multiSearch family.
struct NameMultiSearch
{
    static constexpr const char * name = "multiSearch";
};
struct NameMultiSearchUTF8
{
    static constexpr const char * name = "multiSearchUTF8";
};
struct NameMultiSearchCaseInsensitive
{
    static constexpr const char * name = "multiSearchCaseInsensitive";
};
struct NameMultiSearchCaseInsensitiveUTF8
{
    static constexpr const char * name = "multiSearchCaseInsensitiveUTF8";
};

/// SQL-visible names of the firstMatch family.
struct NameFirstMatch
{
    static constexpr const char * name = "firstMatch";
};
struct NameFirstMatchUTF8
{
    static constexpr const char * name = "firstMatchUTF8";
};
struct NameFirstMatchCaseInsensitive
{
    static constexpr const char * name = "firstMatchCaseInsensitive";
};
struct NameFirstMatchCaseInsensitiveUTF8
{
    static constexpr const char * name = "firstMatchCaseInsensitiveUTF8";
};
struct NameMatch struct NameMatch
{ {
static constexpr auto name = "match"; static constexpr auto name = "match";
@ -1064,6 +1192,27 @@ using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<Posit
using FunctionPositionCaseInsensitiveUTF8 using FunctionPositionCaseInsensitiveUTF8
= FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>; = FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
/// multiPosition family: FunctionsMultiStringPosition over MultiPositionImpl,
/// one alias per searcher policy (case sensitive / insensitive, ASCII / UTF8).
using FunctionMultiPosition = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveASCII>, NameMultiPosition>;
using FunctionMultiPositionUTF8 = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseSensitiveUTF8>, NameMultiPositionUTF8>;
using FunctionMultiPositionCaseInsensitive
    = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveASCII>, NameMultiPositionCaseInsensitive>;
using FunctionMultiPositionCaseInsensitiveUTF8
    = FunctionsMultiStringPosition<MultiPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiPositionCaseInsensitiveUTF8>;
/// multiSearch family: FunctionsMultiStringSearch over MultiSearchImpl, same four policies.
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearch>;
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchUTF8>;
using FunctionMultiSearchCaseInsensitive
    = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveASCII>, NameMultiSearchCaseInsensitive>;
using FunctionMultiSearchCaseInsensitiveUTF8
    = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchCaseInsensitiveUTF8>;
/// firstMatch family: also hosted by FunctionsMultiStringSearch, but backed by FirstMatchImpl.
using FunctionFirstMatch = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveASCII>, NameFirstMatch>;
using FunctionFirstMatchUTF8 = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseSensitiveUTF8>, NameFirstMatchUTF8>;
using FunctionFirstMatchCaseInsensitive
    = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveASCII>, NameFirstMatchCaseInsensitive>;
using FunctionFirstMatchCaseInsensitiveUTF8
    = FunctionsMultiStringSearch<FirstMatchImpl<PositionCaseInsensitiveUTF8>, NameFirstMatchCaseInsensitiveUTF8>;
using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>; using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>;
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>; using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>; using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
@ -1080,14 +1229,32 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
factory.registerFunction<FunctionReplaceAll>(); factory.registerFunction<FunctionReplaceAll>();
factory.registerFunction<FunctionReplaceRegexpOne>(); factory.registerFunction<FunctionReplaceRegexpOne>();
factory.registerFunction<FunctionReplaceRegexpAll>(); factory.registerFunction<FunctionReplaceRegexpAll>();
factory.registerFunction<FunctionPosition>(FunctionFactory::CaseInsensitive); factory.registerFunction<FunctionPosition>(FunctionFactory::CaseInsensitive);
factory.registerFunction<FunctionPositionUTF8>(); factory.registerFunction<FunctionPositionUTF8>();
factory.registerFunction<FunctionPositionCaseInsensitive>(); factory.registerFunction<FunctionPositionCaseInsensitive>();
factory.registerFunction<FunctionPositionCaseInsensitiveUTF8>(); factory.registerFunction<FunctionPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiPosition>();
factory.registerFunction<FunctionMultiPositionUTF8>();
factory.registerFunction<FunctionMultiPositionCaseInsensitive>();
factory.registerFunction<FunctionMultiPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMultiSearch>();
factory.registerFunction<FunctionMultiSearchUTF8>();
factory.registerFunction<FunctionMultiSearchCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchCaseInsensitiveUTF8>();
factory.registerFunction<FunctionFirstMatch>();
factory.registerFunction<FunctionFirstMatchUTF8>();
factory.registerFunction<FunctionFirstMatchCaseInsensitive>();
factory.registerFunction<FunctionFirstMatchCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMatch>(); factory.registerFunction<FunctionMatch>();
factory.registerFunction<FunctionLike>(); factory.registerFunction<FunctionLike>();
factory.registerFunction<FunctionNotLike>(); factory.registerFunction<FunctionNotLike>();
factory.registerFunction<FunctionExtract>(); factory.registerFunction<FunctionExtract>();
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive); factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive); factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
} }

View File

@ -1,17 +1,21 @@
#pragma once #pragma once
#include <Columns/ColumnArray.h>
#include <Columns/ColumnConst.h> #include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h> #include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h> #include <Columns/ColumnVector.h>
#include <Core/Field.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h> #include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h> #include <DataTypes/DataTypesNumber.h>
#include <Functions/IFunction.h>
#include <Functions/FunctionHelpers.h> #include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <IO/WriteHelpers.h>
#include <common/StringRef.h>
#include <ext/range.h>
namespace DB namespace DB
{ {
/** Search and replace functions in strings: /** Search and replace functions in strings:
* *
* position(haystack, needle) - the normal search for a substring in a string, returns the position (in bytes) of the found substring starting with 1, or 0 if no substring is found. * position(haystack, needle) - the normal search for a substring in a string, returns the position (in bytes) of the found substring starting with 1, or 0 if no substring is found.
@ -35,12 +39,28 @@ namespace DB
* *
* replaceRegexpOne(haystack, pattern, replacement) - replaces the pattern with the specified regexp, only the first occurrence. * replaceRegexpOne(haystack, pattern, replacement) - replaces the pattern with the specified regexp, only the first occurrence.
* replaceRegexpAll(haystack, pattern, replacement) - replaces the pattern with the specified type, all occurrences. * replaceRegexpAll(haystack, pattern, replacement) - replaces the pattern with the specified type, all occurrences.
*
* multiPosition(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find first occurrences (positions) of all the const patterns inside haystack
* multiPositionUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiPositionCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiPositionCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*
* multiSearch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- find any of the const patterns inside haystack and return 0 or 1
* multiSearchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatch(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first index of the matched string or zero if nothing was found
* firstMatchUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatchCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* firstMatchCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*/ */
namespace ErrorCodes namespace ErrorCodes
{ {
extern const int ILLEGAL_TYPE_OF_ARGUMENT; extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN; extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
} }
template <typename Impl, typename Name> template <typename Impl, typename Name>
@ -48,20 +68,11 @@ class FunctionsStringSearch : public IFunction
{ {
public: public:
static constexpr auto name = Name::name; static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSearch>(); }
{
return std::make_shared<FunctionsStringSearch>();
}
String getName() const override String getName() const override { return name; }
{
return name;
}
size_t getNumberOfArguments() const override size_t getNumberOfArguments() const override { return 2; }
{
return 2;
}
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{ {
@ -90,7 +101,8 @@ public:
{ {
ResultType res{}; ResultType res{};
Impl::constant_constant(col_haystack_const->getValue<String>(), col_needle_const->getValue<String>(), res); Impl::constant_constant(col_haystack_const->getValue<String>(), col_needle_const->getValue<String>(), res);
block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res)); block.getByPosition(result).column
= block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res));
return; return;
} }
@ -103,20 +115,22 @@ public:
const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle); const ColumnString * col_needle_vector = checkAndGetColumn<ColumnString>(&*column_needle);
if (col_haystack_vector && col_needle_vector) if (col_haystack_vector && col_needle_vector)
Impl::vector_vector(col_haystack_vector->getChars(), Impl::vector_vector(
col_haystack_vector->getChars(),
col_haystack_vector->getOffsets(), col_haystack_vector->getOffsets(),
col_needle_vector->getChars(), col_needle_vector->getChars(),
col_needle_vector->getOffsets(), col_needle_vector->getOffsets(),
vec_res); vec_res);
else if (col_haystack_vector && col_needle_const) else if (col_haystack_vector && col_needle_const)
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needle_const->getValue<String>(), vec_res); Impl::vector_constant(
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), col_needle_const->getValue<String>(), vec_res);
else if (col_haystack_const && col_needle_vector) else if (col_haystack_const && col_needle_vector)
Impl::constant_vector(col_haystack_const->getValue<String>(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res); Impl::constant_vector(
col_haystack_const->getValue<String>(), col_needle_vector->getChars(), col_needle_vector->getOffsets(), vec_res);
else else
throw Exception("Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and " throw Exception(
+ block.getByPosition(arguments[1]).column->getName() "Illegal columns " + block.getByPosition(arguments[0]).column->getName() + " and "
+ " of arguments of function " + block.getByPosition(arguments[1]).column->getName() + " of arguments of function " + getName(),
+ getName(),
ErrorCodes::ILLEGAL_COLUMN); ErrorCodes::ILLEGAL_COLUMN);
block.getByPosition(result).column = std::move(col_res); block.getByPosition(result).column = std::move(col_res);
@ -129,20 +143,11 @@ class FunctionsStringSearchToString : public IFunction
{ {
public: public:
static constexpr auto name = Name::name; static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) static FunctionPtr create(const Context &) { return std::make_shared<FunctionsStringSearchToString>(); }
{
return std::make_shared<FunctionsStringSearchToString>();
}
String getName() const override String getName() const override { return name; }
{
return name;
}
size_t getNumberOfArguments() const override size_t getNumberOfArguments() const override { return 2; }
{
return 2;
}
bool useDefaultImplementationForConstants() const override { return true; } bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; } ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1}; }
@ -186,4 +191,160 @@ public:
} }
}; };
template <typename Impl, typename Name>
class FunctionsMultiStringPosition : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringPosition>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() + 1 >= std::numeric_limits<UInt8>::max())
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size())
+ ", should be at most 255.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[1].get());
if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeArray>(std::make_shared<DataTypeUInt64>());
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
using ResultType = typename Impl::ResultType;
const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column;
const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
const ColumnPtr & arr_ptr = block.getByPosition(arguments[1]).column;
const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get());
if (!col_const_arr)
throw Exception(
"Illegal column " + block.getByPosition(arguments[1]).column->getName() + ". The array is not const",
ErrorCodes::ILLEGAL_COLUMN);
Array src_arr = col_const_arr->getValue<Array>();
std::vector<String> refs;
for (const auto & el : src_arr)
{
refs.push_back(el.get<String>());
}
const size_t column_haystack_size = column_haystack->size();
auto col_res = ColumnVector<ResultType>::create();
auto col_offsets = ColumnArray::ColumnOffsets::create(column_haystack_size);
auto & vec_res = col_res->getData();
auto & offsets_res = col_offsets->getData();
vec_res.resize(column_haystack_size * refs.size());
if (col_haystack_vector)
Impl::multi_constant_vector(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
else
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
size_t refs_size = refs.size();
size_t accum = refs_size;
for (size_t i = 0; i < column_haystack_size; ++i, accum += refs_size)
{
offsets_res[i] = accum;
}
block.getByPosition(result).column = ColumnArray::create(std::move(col_res), std::move(col_offsets));
}
};
template <typename Impl, typename Name>
class FunctionsMultiStringSearch : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionsMultiStringSearch>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 2; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (arguments.size() + 1 >= std::numeric_limits<UInt8>::max())
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(arguments.size())
+ ", should be at most 255.",
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[1].get());
if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
using ResultType = typename Impl::ResultType;
const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column;
const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
const ColumnPtr & arr_ptr = block.getByPosition(arguments[1]).column;
const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get());
if (!col_const_arr)
throw Exception(
"Illegal column " + block.getByPosition(arguments[1]).column->getName() + ". The array is not const",
ErrorCodes::ILLEGAL_COLUMN);
Array src_arr = col_const_arr->getValue<Array>();
std::vector<String> refs;
refs.reserve(src_arr.size());
for (const auto & el : src_arr)
refs.emplace_back(el.get<String>());
const size_t column_haystack_size = column_haystack->size();
auto col_res = ColumnVector<ResultType>::create();
auto & vec_res = col_res->getData();
vec_res.resize(column_haystack_size);
if (col_haystack_vector)
Impl::multi_constant_vector(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
else
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
block.getByPosition(result).column = std::move(col_res);
}
};
} }

View File

@ -0,0 +1,46 @@
<test>
<!-- Performance test comparing the multi-needle search functions (multiPosition, multiSearch, firstMatch)
     with equivalent position() / match() queries over the URL column of hits_100m_single. -->
<name>Constant column string search</name>
<tags>
<tag>search</tag>
</tags>
<!-- The queries below read from hits_100m_single, so the table is required to exist. -->
<preconditions>
<table_exists>hits_100m_single</table_exists>
</preconditions>
<type>loop</type>
<!-- NOTE(review): presumably the run stops once every condition in all_of holds or any condition in any_of holds;
     confirm against the performance-test runner. -->
<stop_conditions>
<all_of>
<iterations>5</iterations>
<min_time_not_changing_for_ms>10000</min_time_not_changing_for_ms>
</all_of>
<any_of>
<iterations>50</iterations>
<total_time_ms>60000</total_time_ms>
</any_of>
</stop_conditions>
<!-- Two needles: separate position() calls vs. one multiPosition() call vs. a regexp alternation. -->
<query><![CDATA[select count(position(URL, 'yandex')), count(position(URL, 'google')) FROM hits_100m_single]]></query>
<query><![CDATA[select count(multiPosition(URL, ['yandex', 'google'])) FROM hits_100m_single]]></query>
<query><![CDATA[select count(match(URL, 'yandex|google')) FROM hits_100m_single]]></query>
<!-- Four needles: separate match() calls vs. one multiSearch() call vs. a regexp alternation. -->
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'yahoo')), sum(match(URL, 'pikabu')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'yahoo', 'pikabu'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|yahoo|pikabu')) FROM hits_100m_single]]></query>
<!-- Three needles, including 'http'. -->
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'http')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'http'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|http')) FROM hits_100m_single]]></query>
<!-- Five needles: separate match() calls vs. one multiSearch() call vs. a regexp alternation. -->
<query><![CDATA[select sum(match(URL, 'yandex')), sum(match(URL, 'google')), sum(match(URL, 'facebook')), sum(match(URL, 'wikipedia')), sum(match(URL, 'reddit')) FROM hits_100m_single]]></query>
<query><![CDATA[select sum(multiSearch(URL, ['yandex', 'google', 'facebook', 'wikipedia', 'reddit'])) from hits_100m_single]]></query>
<query><![CDATA[select sum(match(URL, 'yandex|google|facebook|wikipedia|reddit')) FROM hits_100m_single]]></query>
<!-- firstMatch() with five needles; NOTE(review): 'google' appears twice in the list - confirm this is intentional. -->
<query><![CDATA[select sum(firstMatch(URL, ['yandex', 'google', 'http', 'facebook', 'google'])) from hits_100m_single]]></query>
<!-- The reported metric is the minimum query time. -->
<main_metric>
<min_time/>
</main_metric>
</test>