Added hyperscan fuzzy search

This commit is contained in:
Danila Kutenin 2019-03-29 04:02:05 +03:00
parent 8f9c8defc5
commit ef0d622a5f
12 changed files with 1280 additions and 942 deletions

View File

@ -0,0 +1,945 @@
#include "FunctionsStringRegex.h"
#include "FunctionsStringSearch.h"
#include <Columns/ColumnFixedString.h>
#include <DataTypes/DataTypeFixedString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/Regexps.h>
#include <IO/WriteHelpers.h>
#include <re2/re2.h>
#include <re2/stringpiece.h>
#include <Poco/UTF8String.h>
#include <Common/Volnitsky.h>
#include <algorithm>
#include <memory>
#include <optional>
#include <string>
#include <vector>
#include <Common/config.h>
#if USE_HYPERSCAN
# if __has_include(<hs/hs.h>)
# include <hs/hs.h>
# else
# include <hs.h>
# endif
#endif
#if USE_RE2_ST
# include <re2_st/re2.h> // Y_IGNORE
#else
# define re2_st re2
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_COLUMN;
extern const int TOO_MANY_BYTES;
extern const int NOT_IMPLEMENTED;
}
/// Is the LIKE expression reduced to finding a substring in a string?
inline bool likePatternIsStrstr(const String & pattern, String & res)
{
res = "";
if (pattern.size() < 2 || pattern.front() != '%' || pattern.back() != '%')
return false;
res.reserve(pattern.size() * 2);
const char * pos = pattern.data();
const char * end = pos + pattern.size();
++pos;
--end;
while (pos < end)
{
switch (*pos)
{
case '%':
case '_':
return false;
case '\\':
++pos;
if (pos == end)
return false;
else
res += *pos;
break;
default:
res += *pos;
break;
}
++pos;
}
return true;
}
/** 'like' - if true, treat pattern as SQL LIKE; if false - treat pattern as re2 regexp.
* NOTE: We want to run regexp search for whole block by one call (as implemented in function 'position')
* but for that, regexp engine must support \0 bytes and their interpretation as string boundaries.
*/
template <bool like, bool revert = false>
struct MatchImpl
{
using ResultType = UInt8;
static void vector_constant(
const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray<UInt8> & res)
{
if (offsets.empty())
return;
String strstr_pattern;
/// A simple case where the LIKE expression reduces to finding a substring in a string
if (like && likePatternIsStrstr(pattern, strstr_pattern))
{
const UInt8 * begin = data.data();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
/// The current index in the array of strings.
size_t i = 0;
/// TODO You need to make that `searcher` is common to all the calls of the function.
Volnitsky searcher(strstr_pattern.data(), strstr_pattern.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
{
/// Let's determine which index it refers to.
while (begin + offsets[i] <= pos)
{
res[i] = revert;
++i;
}
/// We check that the entry does not pass through the boundaries of strings.
if (pos + strstr_pattern.size() < begin + offsets[i])
res[i] = !revert;
else
res[i] = revert;
pos = begin + offsets[i];
++i;
}
/// Tail, in which there can be no substring.
if (i < res.size())
memset(&res[i], revert, (res.size() - i) * sizeof(res[0]));
}
else
{
size_t size = offsets.size();
const auto & regexp = Regexps::get<like, true>(pattern);
std::string required_substring;
bool is_trivial;
bool required_substring_is_prefix; /// for `anchored` execution of the regexp.
regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix);
if (required_substring.empty())
{
if (!regexp->getRE2()) /// An empty regexp. Always matches.
{
if (size)
memset(res.data(), 1, size * sizeof(res[0]));
}
else
{
size_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
{
res[i] = revert
^ regexp->getRE2()->Match(
re2_st::StringPiece(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1),
0,
offsets[i] - prev_offset - 1,
re2_st::RE2::UNANCHORED,
nullptr,
0);
prev_offset = offsets[i];
}
}
}
else
{
/// NOTE This almost matches with the case of LikePatternIsStrstr.
const UInt8 * begin = data.data();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
/// The current index in the array of strings.
size_t i = 0;
Volnitsky searcher(required_substring.data(), required_substring.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
{
/// Determine which index it refers to.
while (begin + offsets[i] <= pos)
{
res[i] = revert;
++i;
}
/// We check that the entry does not pass through the boundaries of strings.
if (pos + strstr_pattern.size() < begin + offsets[i])
{
/// And if it does not, if necessary, we check the regexp.
if (is_trivial)
res[i] = !revert;
else
{
const char * str_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
size_t str_size = offsets[i] - offsets[i - 1] - 1;
/** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
* so that it can match when `required_substring` occurs into the string several times,
* and at the first occurrence, the regexp is not a match.
*/
if (required_substring_is_prefix)
res[i] = revert
^ regexp->getRE2()->Match(
re2_st::StringPiece(str_data, str_size),
reinterpret_cast<const char *>(pos) - str_data,
str_size,
re2_st::RE2::UNANCHORED,
nullptr,
0);
else
res[i] = revert
^ regexp->getRE2()->Match(
re2_st::StringPiece(str_data, str_size), 0, str_size, re2_st::RE2::UNANCHORED, nullptr, 0);
}
}
else
res[i] = revert;
pos = begin + offsets[i];
++i;
}
if (i < res.size())
memset(&res[i], revert, (res.size() - i) * sizeof(res[0]));
}
}
}
static void constant_constant(const std::string & data, const std::string & pattern, UInt8 & res)
{
const auto & regexp = Regexps::get<like, true>(pattern);
res = revert ^ regexp->match(data);
}
template <typename... Args>
static void vector_vector(Args &&...)
{
throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
}
/// Search different needles in single haystack.
template <typename... Args>
static void constant_vector(Args &&...)
{
throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
}
};
template <typename Type, bool FindAny, bool FindAnyIndex, bool MultiSearchDistance>
struct MultiMatchAnyImpl
{
static_assert(static_cast<int>(FindAny) + static_cast<int>(FindAnyIndex) == 1);
using ResultType = Type;
static constexpr bool is_using_hyperscan = true;
static void vector_constant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const std::vector<StringRef> & needles,
PaddedPODArray<Type> & res)
{
vector_constant(haystack_data, haystack_offsets, needles, res, std::nullopt);
}
static void vector_constant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const std::vector<StringRef> & needles,
PaddedPODArray<Type> & res,
[[maybe_unused]] std::optional<UInt32> edit_distance)
{
(void)FindAny;
(void)FindAnyIndex;
#if USE_HYPERSCAN
const auto & hyperscan_regex = MultiRegexps::get<FindAnyIndex, MultiSearchDistance>(needles, edit_distance);
hs_scratch_t * scratch = nullptr;
hs_error_t err = hs_alloc_scratch(hyperscan_regex->get(), &scratch);
if (err != HS_SUCCESS)
throw Exception("Could not allocate scratch space for hyperscan", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
MultiRegexps::ScratchPtr smart_scratch(scratch);
auto on_match = []([[maybe_unused]] unsigned int id,
unsigned long long /* from */,
unsigned long long /* to */,
unsigned int /* flags */,
void * context) -> int
{
if constexpr (FindAnyIndex)
*reinterpret_cast<Type *>(context) = id;
else if constexpr (FindAny)
*reinterpret_cast<Type *>(context) = 1;
return 0;
};
const size_t haystack_offsets_size = haystack_offsets.size();
UInt64 offset = 0;
for (size_t i = 0; i < haystack_offsets_size; ++i)
{
UInt64 length = haystack_offsets[i] - offset - 1;
if (length > std::numeric_limits<UInt32>::max())
throw Exception("Too long string to search", ErrorCodes::TOO_MANY_BYTES);
res[i] = 0;
hs_scan(
hyperscan_regex->get(),
reinterpret_cast<const char *>(haystack_data.data()) + offset,
length,
0,
smart_scratch.get(),
on_match,
&res[i]);
offset = haystack_offsets[i];
}
#else
/// Fallback if do not use hyperscan
if constexpr (MultiSearchDistance)
throw Exception(
"Edit distance multi-search is not implemented when hyperscan is off (is it Intel processor?)",
ErrorCodes::NOT_IMPLEMENTED);
PaddedPODArray<UInt8> accum(res.size());
memset(res.data(), 0, res.size() * sizeof(res.front()));
memset(accum.data(), 0, accum.size());
for (size_t j = 0; j < needles.size(); ++j)
{
MatchImpl<false, false>::vector_constant(haystack_data, haystack_offsets, needles[j].toString(), accum);
for (size_t i = 0; i < res.size(); ++i)
{
if constexpr (FindAny)
res[i] |= accum[i];
else if (FindAnyIndex && accum[i])
res[i] = j + 1;
}
}
#endif // USE_HYPERSCAN
}
};
struct ExtractImpl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & pattern,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
res_data.reserve(data.size() / 5);
res_offsets.resize(offsets.size());
const auto & regexp = Regexps::get<false, false>(pattern);
unsigned capture = regexp->getNumberOfSubpatterns() > 0 ? 1 : 0;
OptimizedRegularExpression::MatchVec matches;
matches.reserve(capture + 1);
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
size_t cur_offset = offsets[i];
unsigned count
= regexp->match(reinterpret_cast<const char *>(&data[prev_offset]), cur_offset - prev_offset - 1, matches, capture + 1);
if (count > capture && matches[capture].offset != std::string::npos)
{
const auto & match = matches[capture];
res_data.resize(res_offset + match.length + 1);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], &data[prev_offset + match.offset], match.length);
res_offset += match.length;
}
else
{
res_data.resize(res_offset + 1);
}
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
/** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants.
* 'replacement' could contain substitutions, for example: '\2-\3-\1'
*/
template <bool replace_one = false>
struct ReplaceRegexpImpl
{
/// Sequence of instructions, describing how to get resulting string.
/// Each element is either:
/// - substitution (in that case first element of pair is their number and second element is empty)
/// - string that need to be inserted (in that case, first element of pair is that string and second element is -1)
using Instructions = std::vector<std::pair<int, std::string>>;
static const size_t max_captures = 10;
static Instructions createInstructions(const std::string & s, int num_captures)
{
Instructions instructions;
String now = "";
for (size_t i = 0; i < s.size(); ++i)
{
if (s[i] == '\\' && i + 1 < s.size())
{
if (isNumericASCII(s[i + 1])) /// Substitution
{
if (!now.empty())
{
instructions.emplace_back(-1, now);
now = "";
}
instructions.emplace_back(s[i + 1] - '0', String());
}
else
now += s[i + 1]; /// Escaping
++i;
}
else
now += s[i]; /// Plain character
}
if (!now.empty())
{
instructions.emplace_back(-1, now);
now = "";
}
for (const auto & it : instructions)
if (it.first >= num_captures)
throw Exception(
"Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only "
+ toString(num_captures - 1) + " subpatterns",
ErrorCodes::BAD_ARGUMENTS);
return instructions;
}
static void processString(
const re2_st::StringPiece & input,
ColumnString::Chars & res_data,
ColumnString::Offset & res_offset,
re2_st::RE2 & searcher,
int num_captures,
const Instructions & instructions)
{
re2_st::StringPiece matches[max_captures];
size_t start_pos = 0;
while (start_pos < static_cast<size_t>(input.length()))
{
/// If no more replacements possible for current string
bool can_finish_current_string = false;
if (searcher.Match(input, start_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures))
{
const auto & match = matches[0];
size_t bytes_to_copy = (match.data() - input.data()) - start_pos;
/// Copy prefix before matched regexp without modification
res_data.resize(res_data.size() + bytes_to_copy);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy);
res_offset += bytes_to_copy;
start_pos += bytes_to_copy + match.length();
/// Do substitution instructions
for (const auto & it : instructions)
{
if (it.first >= 0)
{
res_data.resize(res_data.size() + matches[it.first].length());
memcpy(&res_data[res_offset], matches[it.first].data(), matches[it.first].length());
res_offset += matches[it.first].length();
}
else
{
res_data.resize(res_data.size() + it.second.size());
memcpy(&res_data[res_offset], it.second.data(), it.second.size());
res_offset += it.second.size();
}
}
if (replace_one || match.length() == 0) /// Stop after match of zero length, to avoid infinite loop.
can_finish_current_string = true;
}
else
can_finish_current_string = true;
/// If ready, append suffix after match to end of string.
if (can_finish_current_string)
{
res_data.resize(res_data.size() + input.length() - start_pos);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, input.length() - start_pos);
res_offset += input.length() - start_pos;
start_pos = input.length();
}
}
res_data.resize(res_data.size() + 1);
res_data[res_offset] = 0;
++res_offset;
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & needle,
const std::string & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
ColumnString::Offset res_offset = 0;
res_data.reserve(data.size());
size_t size = offsets.size();
res_offsets.resize(size);
re2_st::RE2 searcher(needle);
int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures));
Instructions instructions = createInstructions(replacement, num_captures);
/// Cannot perform search for whole block. Will process each string separately.
for (size_t i = 0; i < size; ++i)
{
int from = i > 0 ? offsets[i - 1] : 0;
re2_st::StringPiece input(reinterpret_cast<const char *>(data.data() + from), offsets[i] - from - 1);
processString(input, res_data, res_offset, searcher, num_captures, instructions);
res_offsets[i] = res_offset;
}
}
static void vector_fixed(
const ColumnString::Chars & data,
size_t n,
const std::string & needle,
const std::string & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
ColumnString::Offset res_offset = 0;
size_t size = data.size() / n;
res_data.reserve(data.size());
res_offsets.resize(size);
re2_st::RE2 searcher(needle);
int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures));
Instructions instructions = createInstructions(replacement, num_captures);
for (size_t i = 0; i < size; ++i)
{
int from = i * n;
re2_st::StringPiece input(reinterpret_cast<const char *>(data.data() + from), n);
processString(input, res_data, res_offset, searcher, num_captures, instructions);
res_offsets[i] = res_offset;
}
}
};
/** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants.
*/
template <bool replace_one = false>
struct ReplaceStringImpl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & needle,
const std::string & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
const UInt8 * begin = data.data();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
ColumnString::Offset res_offset = 0;
res_data.reserve(data.size());
size_t size = offsets.size();
res_offsets.resize(size);
/// The current index in the array of strings.
size_t i = 0;
Volnitsky searcher(needle.data(), needle.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end)
{
const UInt8 * match = searcher.search(pos, end - pos);
/// Copy the data without changing
res_data.resize(res_data.size() + (match - pos));
memcpy(&res_data[res_offset], pos, match - pos);
/// Determine which index it belongs to.
while (i < offsets.size() && begin + offsets[i] <= match)
{
res_offsets[i] = res_offset + ((begin + offsets[i]) - pos);
++i;
}
res_offset += (match - pos);
/// If you have reached the end, it's time to stop
if (i == offsets.size())
break;
/// Is it true that this string no longer needs to perform transformations.
bool can_finish_current_string = false;
/// We check that the entry does not go through the boundaries of strings.
if (match + needle.size() < begin + offsets[i])
{
res_data.resize(res_data.size() + replacement.size());
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
res_offset += replacement.size();
pos = match + needle.size();
if (replace_one)
can_finish_current_string = true;
}
else
{
pos = match;
can_finish_current_string = true;
}
if (can_finish_current_string)
{
res_data.resize(res_data.size() + (begin + offsets[i] - pos));
memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos));
res_offset += (begin + offsets[i] - pos);
res_offsets[i] = res_offset;
pos = begin + offsets[i];
++i;
}
}
}
/// Note: this function converts fixed-length strings to variable-length strings
/// and each variable-length string should ends with zero byte.
static void vector_fixed(
const ColumnString::Chars & data,
size_t n,
const std::string & needle,
const std::string & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
const UInt8 * begin = data.data();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
ColumnString::Offset res_offset = 0;
size_t count = data.size() / n;
res_data.reserve(data.size());
res_offsets.resize(count);
/// The current index in the string array.
size_t i = 0;
Volnitsky searcher(needle.data(), needle.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end)
{
const UInt8 * match = searcher.search(pos, end - pos);
#define COPY_REST_OF_CURRENT_STRING() \
do \
{ \
const size_t len = begin + n * (i + 1) - pos; \
res_data.resize(res_data.size() + len + 1); \
memcpy(&res_data[res_offset], pos, len); \
res_offset += len; \
res_data[res_offset++] = 0; \
res_offsets[i] = res_offset; \
pos = begin + n * (i + 1); \
++i; \
} while (false)
/// Copy skipped strings without any changes but
/// add zero byte to the end of each string.
while (i < count && begin + n * (i + 1) <= match)
{
COPY_REST_OF_CURRENT_STRING();
}
/// If you have reached the end, it's time to stop
if (i == count)
break;
/// Copy unchanged part of current string.
res_data.resize(res_data.size() + (match - pos));
memcpy(&res_data[res_offset], pos, match - pos);
res_offset += (match - pos);
/// Is it true that this string no longer needs to perform conversions.
bool can_finish_current_string = false;
/// We check that the entry does not pass through the boundaries of strings.
if (match + needle.size() <= begin + n * (i + 1))
{
res_data.resize(res_data.size() + replacement.size());
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
res_offset += replacement.size();
pos = match + needle.size();
if (replace_one || pos == begin + n * (i + 1))
can_finish_current_string = true;
}
else
{
pos = match;
can_finish_current_string = true;
}
if (can_finish_current_string)
{
COPY_REST_OF_CURRENT_STRING();
}
#undef COPY_REST_OF_CURRENT_STRING
}
}
static void constant(const std::string & data, const std::string & needle, const std::string & replacement, std::string & res_data)
{
res_data = "";
int replace_cnt = 0;
for (size_t i = 0; i < data.size(); ++i)
{
bool match = true;
if (i + needle.size() > data.size() || (replace_one && replace_cnt > 0))
match = false;
for (size_t j = 0; match && j < needle.size(); ++j)
if (data[i + j] != needle[j])
match = false;
if (match)
{
++replace_cnt;
res_data += replacement;
i = i + needle.size() - 1;
}
else
res_data += data[i];
}
}
};
template <typename Impl, typename Name>
class FunctionStringReplace : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionStringReplace>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 3; }
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isStringOrFixedString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isStringOrFixedString(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isStringOrFixedString(arguments[2]))
throw Exception(
"Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>();
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
const ColumnPtr column_src = block.getByPosition(arguments[0]).column;
const ColumnPtr column_needle = block.getByPosition(arguments[1]).column;
const ColumnPtr column_replacement = block.getByPosition(arguments[2]).column;
if (!column_needle->isColumnConst() || !column_replacement->isColumnConst())
throw Exception("2nd and 3rd arguments of function " + getName() + " must be constants.", ErrorCodes::ILLEGAL_COLUMN);
const IColumn * c1 = block.getByPosition(arguments[1]).column.get();
const IColumn * c2 = block.getByPosition(arguments[2]).column.get();
const ColumnConst * c1_const = typeid_cast<const ColumnConst *>(c1);
const ColumnConst * c2_const = typeid_cast<const ColumnConst *>(c2);
String needle = c1_const->getValue<String>();
String replacement = c2_const->getValue<String>();
if (needle.size() == 0)
throw Exception("Length of the second argument of function replace must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, col_res->getChars(), col_res->getOffsets());
block.getByPosition(result).column = std::move(col_res);
}
else if (const ColumnFixedString * col_fixed = checkAndGetColumn<ColumnFixedString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vector_fixed(col_fixed->getChars(), col_fixed->getN(), needle, replacement, col_res->getChars(), col_res->getOffsets());
block.getByPosition(result).column = std::move(col_res);
}
else
throw Exception(
"Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
};
struct NameMatch
{
static constexpr auto name = "match";
};
struct NameLike
{
static constexpr auto name = "like";
};
struct NameNotLike
{
static constexpr auto name = "notLike";
};
struct NameMultiMatchAny
{
static constexpr auto name = "multiMatchAny";
};
struct NameMultiMatchAnyIndex
{
static constexpr auto name = "multiMatchAnyIndex";
};
/// FunctionsStringSimilarity or Regex?
struct NameMultiMatchAnyEditDistance
{
static constexpr auto name = "multiMatchAnyEditDistance";
};
struct NameMultiMatchAnyIndexEditDistance
{
static constexpr auto name = "multiMatchAnyIndexEditDistance";
};
struct NameExtract
{
static constexpr auto name = "extract";
};
struct NameReplaceOne
{
static constexpr auto name = "replaceOne";
};
struct NameReplaceAll
{
static constexpr auto name = "replaceAll";
};
struct NameReplaceRegexpOne
{
static constexpr auto name = "replaceRegexpOne";
};
struct NameReplaceRegexpAll
{
static constexpr auto name = "replaceRegexpAll";
};
using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>;
using FunctionMultiMatchAny = FunctionsMultiStringSearch<
MultiMatchAnyImpl<UInt8, true, false, false>,
NameMultiMatchAny,
std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch<
MultiMatchAnyImpl<UInt64, false, true, false>,
NameMultiMatchAnyIndex,
std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAnyEditDistance = FunctionsMultiStringSearchEditDistance<
MultiMatchAnyImpl<UInt8, true, false, true>,
NameMultiMatchAnyEditDistance,
std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAnyIndexEditDistance = FunctionsMultiStringSearchEditDistance<
MultiMatchAnyImpl<UInt64, false, true, true>,
NameMultiMatchAnyIndexEditDistance,
std::numeric_limits<UInt32>::max()>;
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
using FunctionReplaceOne = FunctionStringReplace<ReplaceStringImpl<true>, NameReplaceOne>;
using FunctionReplaceAll = FunctionStringReplace<ReplaceStringImpl<false>, NameReplaceAll>;
using FunctionReplaceRegexpOne = FunctionStringReplace<ReplaceRegexpImpl<true>, NameReplaceRegexpOne>;
using FunctionReplaceRegexpAll = FunctionStringReplace<ReplaceRegexpImpl<false>, NameReplaceRegexpAll>;
void registerFunctionsStringRegex(FunctionFactory & factory)
{
factory.registerFunction<FunctionMatch>();
factory.registerFunction<FunctionLike>();
factory.registerFunction<FunctionNotLike>();
factory.registerFunction<FunctionExtract>();
factory.registerFunction<FunctionReplaceOne>();
factory.registerFunction<FunctionReplaceAll>();
factory.registerFunction<FunctionReplaceRegexpOne>();
factory.registerFunction<FunctionReplaceRegexpAll>();
factory.registerFunction<FunctionMultiMatchAny>();
factory.registerFunction<FunctionMultiMatchAnyIndex>();
factory.registerFunction<FunctionMultiMatchAnyEditDistance>();
factory.registerFunction<FunctionMultiMatchAnyIndexEditDistance>();
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
}
}

View File

@ -0,0 +1,136 @@
#pragma once
#include <Columns/ColumnArray.h>
#include <Columns/ColumnConst.h>
#include <Columns/ColumnString.h>
#include <Columns/ColumnVector.h>
#include <Columns/ColumnsNumber.h>
#include <Core/Field.h>
#include <DataTypes/DataTypeArray.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
#include <Functions/FunctionHelpers.h>
#include <Functions/IFunction.h>
#include <IO/WriteHelpers.h>
#include <Interpreters/Context.h>
#include <common/StringRef.h>
#include <optional>
namespace DB
{
namespace ErrorCodes
{
extern const int ILLEGAL_TYPE_OF_ARGUMENT;
extern const int ILLEGAL_COLUMN;
extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
extern const int FUNCTION_NOT_ALLOWED;
}
template <typename Impl, typename Name, size_t LimitArgs>
class FunctionsMultiStringSearchEditDistance : public IFunction
{
static_assert(LimitArgs > 0);
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context & context)
{
if (Impl::is_using_hyperscan && !context.getSettingsRef().allow_hyperscan)
throw Exception(
"Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED);
return std::make_shared<FunctionsMultiStringSearchEditDistance>();
}
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 3; }
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isUnsignedInteger(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
const DataTypeArray * array_type = checkAndGetDataType<DataTypeArray>(arguments[2].get());
if (!array_type || !checkAndGetDataType<DataTypeString>(array_type->getNestedType().get()))
throw Exception(
"Illegal type " + arguments[2]->getName() + " of argument of function " + getName(), ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeNumber<typename Impl::ResultType>>();
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
using ResultType = typename Impl::ResultType;
const ColumnPtr & column_haystack = block.getByPosition(arguments[0]).column;
const ColumnString * col_haystack_vector = checkAndGetColumn<ColumnString>(&*column_haystack);
const ColumnPtr & num_ptr = block.getByPosition(arguments[1]).column;
const ColumnConst * col_const_num = nullptr;
UInt32 edit_distance = 0;
if ((col_const_num = checkAndGetColumnConst<ColumnUInt8>(num_ptr.get())))
edit_distance = col_const_num->getValue<UInt8>();
else if ((col_const_num = checkAndGetColumnConst<ColumnUInt16>(num_ptr.get())))
edit_distance = col_const_num->getValue<UInt16>();
else if ((col_const_num = checkAndGetColumnConst<ColumnUInt32>(num_ptr.get())))
edit_distance = col_const_num->getValue<UInt32>();
else
throw Exception(
"Illegal column " + block.getByPosition(arguments[1]).column->getName()
+ ". The number is not const or does not fit in UInt32",
ErrorCodes::ILLEGAL_COLUMN);
const ColumnPtr & arr_ptr = block.getByPosition(arguments[2]).column;
const ColumnConst * col_const_arr = checkAndGetColumnConst<ColumnArray>(arr_ptr.get());
if (!col_const_arr)
throw Exception(
"Illegal column " + block.getByPosition(arguments[2]).column->getName() + ". The array is not const",
ErrorCodes::ILLEGAL_COLUMN);
Array src_arr = col_const_arr->getValue<Array>();
if (src_arr.size() > LimitArgs)
throw Exception(
"Number of arguments for function " + getName() + " doesn't match: passed " + std::to_string(src_arr.size())
+ ", should be at most " + std::to_string(LimitArgs),
ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
std::vector<StringRef> refs;
refs.reserve(src_arr.size());
for (const auto & el : src_arr)
refs.emplace_back(el.get<String>());
const size_t column_haystack_size = column_haystack->size();
auto col_res = ColumnVector<ResultType>::create();
auto & vec_res = col_res->getData();
vec_res.resize(column_haystack_size);
if (col_haystack_vector)
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res, edit_distance);
else
throw Exception("Illegal column " + block.getByPosition(arguments[0]).column->getName(), ErrorCodes::ILLEGAL_COLUMN);
block.getByPosition(result).column = std::move(col_res);
}
};
}

View File

@ -1,42 +1,19 @@
#include "FunctionsStringSearch.h"
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include <Columns/ColumnFixedString.h>
#include <DataTypes/DataTypeFixedString.h>
#include <Functions/FunctionFactory.h>
#include <Functions/Regexps.h>
#include <IO/WriteHelpers.h>
#include <re2/re2.h>
#include <re2/stringpiece.h>
#include <Poco/UTF8String.h>
#include <Common/Volnitsky.h>
#include <algorithm>
#include <memory>
#include <Common/config.h>
#if USE_HYPERSCAN
# if __has_include(<hs/hs.h>)
# include <hs/hs.h>
# else
# include <hs.h>
# endif
#endif
#if USE_RE2_ST
# include <re2_st/re2.h> // Y_IGNORE
#else
# define re2_st re2
#endif
namespace DB
{
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
extern const int ILLEGAL_COLUMN;
extern const int TOO_MANY_BYTES;
}
/** Implementation details for functions of 'position' family depending on ASCII/UTF8 and case sensitiveness.
*/
struct PositionCaseSensitiveASCII
@ -329,8 +306,7 @@ struct MultiSearchAllPositionsImpl
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
{
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64 {
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
Impl::createMultiSearcherInBigHaystack(needles).searchAllPositions(haystack_data, haystack_offsets, res_callback, res);
@ -365,8 +341,7 @@ struct MultiSearchFirstPositionImpl
const std::vector<StringRef> & needles,
PaddedPODArray<UInt64> & res)
{
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64
{
auto res_callback = [](const UInt8 * start, const UInt8 * end) -> UInt64 {
return 1 + Impl::countChars(reinterpret_cast<const char *>(start), reinterpret_cast<const char *>(end));
};
Impl::createMultiSearcherInBigHaystack(needles).searchFirstPosition(haystack_data, haystack_offsets, res_callback, res);
@ -390,794 +365,6 @@ struct MultiSearchFirstIndexImpl
};
/// Is the LIKE expression reduced to finding a substring in a string?
inline bool likePatternIsStrstr(const String & pattern, String & res)
{
res = "";
if (pattern.size() < 2 || pattern.front() != '%' || pattern.back() != '%')
return false;
res.reserve(pattern.size() * 2);
const char * pos = pattern.data();
const char * end = pos + pattern.size();
++pos;
--end;
while (pos < end)
{
switch (*pos)
{
case '%':
case '_':
return false;
case '\\':
++pos;
if (pos == end)
return false;
else
res += *pos;
break;
default:
res += *pos;
break;
}
++pos;
}
return true;
}
/** 'like' - if true, treat pattern as SQL LIKE; if false - treat pattern as re2 regexp.
* NOTE: We want to run regexp search for whole block by one call (as implemented in function 'position')
* but for that, regexp engine must support \0 bytes and their interpretation as string boundaries.
*/
template <bool like, bool revert = false>
struct MatchImpl
{
using ResultType = UInt8;
static void vector_constant(
const ColumnString::Chars & data, const ColumnString::Offsets & offsets, const std::string & pattern, PaddedPODArray<UInt8> & res)
{
if (offsets.empty())
return;
String strstr_pattern;
/// A simple case where the LIKE expression reduces to finding a substring in a string
if (like && likePatternIsStrstr(pattern, strstr_pattern))
{
const UInt8 * begin = data.data();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
/// The current index in the array of strings.
size_t i = 0;
/// TODO You need to make that `searcher` is common to all the calls of the function.
Volnitsky searcher(strstr_pattern.data(), strstr_pattern.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
{
/// Let's determine which index it refers to.
while (begin + offsets[i] <= pos)
{
res[i] = revert;
++i;
}
/// We check that the entry does not pass through the boundaries of strings.
if (pos + strstr_pattern.size() < begin + offsets[i])
res[i] = !revert;
else
res[i] = revert;
pos = begin + offsets[i];
++i;
}
/// Tail, in which there can be no substring.
if (i < res.size())
memset(&res[i], revert, (res.size() - i) * sizeof(res[0]));
}
else
{
size_t size = offsets.size();
const auto & regexp = Regexps::get<like, true>(pattern);
std::string required_substring;
bool is_trivial;
bool required_substring_is_prefix; /// for `anchored` execution of the regexp.
regexp->getAnalyzeResult(required_substring, is_trivial, required_substring_is_prefix);
if (required_substring.empty())
{
if (!regexp->getRE2()) /// An empty regexp. Always matches.
{
if (size)
memset(res.data(), 1, size * sizeof(res[0]));
}
else
{
size_t prev_offset = 0;
for (size_t i = 0; i < size; ++i)
{
res[i] = revert
^ regexp->getRE2()->Match(
re2_st::StringPiece(reinterpret_cast<const char *>(&data[prev_offset]), offsets[i] - prev_offset - 1),
0,
offsets[i] - prev_offset - 1,
re2_st::RE2::UNANCHORED,
nullptr,
0);
prev_offset = offsets[i];
}
}
}
else
{
/// NOTE This almost matches with the case of LikePatternIsStrstr.
const UInt8 * begin = data.data();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
/// The current index in the array of strings.
size_t i = 0;
Volnitsky searcher(required_substring.data(), required_substring.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end && end != (pos = searcher.search(pos, end - pos)))
{
/// Determine which index it refers to.
while (begin + offsets[i] <= pos)
{
res[i] = revert;
++i;
}
/// We check that the entry does not pass through the boundaries of strings.
if (pos + strstr_pattern.size() < begin + offsets[i])
{
/// And if it does not, if necessary, we check the regexp.
if (is_trivial)
res[i] = !revert;
else
{
const char * str_data = reinterpret_cast<const char *>(&data[offsets[i - 1]]);
size_t str_size = offsets[i] - offsets[i - 1] - 1;
/** Even in the case of `required_substring_is_prefix` use UNANCHORED check for regexp,
* so that it can match when `required_substring` occurs into the string several times,
* and at the first occurrence, the regexp is not a match.
*/
if (required_substring_is_prefix)
res[i] = revert
^ regexp->getRE2()->Match(
re2_st::StringPiece(str_data, str_size),
reinterpret_cast<const char *>(pos) - str_data,
str_size,
re2_st::RE2::UNANCHORED,
nullptr,
0);
else
res[i] = revert
^ regexp->getRE2()->Match(
re2_st::StringPiece(str_data, str_size), 0, str_size, re2_st::RE2::UNANCHORED, nullptr, 0);
}
}
else
res[i] = revert;
pos = begin + offsets[i];
++i;
}
if (i < res.size())
memset(&res[i], revert, (res.size() - i) * sizeof(res[0]));
}
}
}
static void constant_constant(const std::string & data, const std::string & pattern, UInt8 & res)
{
const auto & regexp = Regexps::get<like, true>(pattern);
res = revert ^ regexp->match(data);
}
template <typename... Args>
static void vector_vector(Args &&...)
{
throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
}
/// Search different needles in single haystack.
template <typename... Args>
static void constant_vector(Args &&...)
{
throw Exception("Functions 'like' and 'match' don't support non-constant needle argument", ErrorCodes::ILLEGAL_COLUMN);
}
};
template <typename Type, bool FindAny, bool FindAnyIndex>
struct MultiMatchAnyImpl
{
static_assert(static_cast<int>(FindAny) + static_cast<int>(FindAnyIndex) == 1);
using ResultType = Type;
static constexpr bool is_using_hyperscan = true;
static void vector_constant(
const ColumnString::Chars & haystack_data,
const ColumnString::Offsets & haystack_offsets,
const std::vector<StringRef> & needles,
PaddedPODArray<Type> & res)
{
(void)FindAny;
(void)FindAnyIndex;
#if USE_HYPERSCAN
using ScratchPtr = std::unique_ptr<hs_scratch_t, DB::MultiRegexps::HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
const auto & hyperscan_regex = MultiRegexps::get<FindAnyIndex>(needles);
hs_scratch_t * scratch = nullptr;
hs_error_t err = hs_alloc_scratch(hyperscan_regex->get(), &scratch);
if (err != HS_SUCCESS)
throw Exception("Could not allocate scratch space for hyperscan.", ErrorCodes::CANNOT_ALLOCATE_MEMORY);
ScratchPtr smart_scratch(scratch);
auto on_match = []([[maybe_unused]] unsigned int id,
unsigned long long /* from */,
unsigned long long /* to */,
unsigned int /* flags */,
void * context) -> int
{
if constexpr (FindAnyIndex)
*reinterpret_cast<Type *>(context) = id;
else if constexpr (FindAny)
*reinterpret_cast<Type *>(context) = 1;
return 0;
};
const size_t haystack_offsets_size = haystack_offsets.size();
UInt64 offset = 0;
for (size_t i = 0; i < haystack_offsets_size; ++i)
{
UInt64 length = haystack_offsets[i] - offset - 1;
if (length > std::numeric_limits<UInt32>::max())
throw Exception("Too long string to search", ErrorCodes::TOO_MANY_BYTES);
res[i] = 0;
hs_scan(
hyperscan_regex->get(),
reinterpret_cast<const char *>(haystack_data.data()) + offset,
length,
0,
smart_scratch.get(),
on_match,
&res[i]);
offset = haystack_offsets[i];
}
#else
/// Fallback if do not use hyperscan
PaddedPODArray<UInt8> accum(res.size());
memset(res.data(), 0, res.size() * sizeof(res.front()));
memset(accum.data(), 0, accum.size());
for (size_t j = 0; j < needles.size(); ++j)
{
MatchImpl<false, false>::vector_constant(haystack_data, haystack_offsets, needles[j].toString(), accum);
for (size_t i = 0; i < res.size(); ++i)
{
if constexpr (FindAny)
res[i] |= accum[i];
else if (accum[i])
res[i] = j + 1;
}
}
#endif // USE_HYPERSCAN
}
};
struct ExtractImpl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & pattern,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
res_data.reserve(data.size() / 5);
res_offsets.resize(offsets.size());
const auto & regexp = Regexps::get<false, false>(pattern);
unsigned capture = regexp->getNumberOfSubpatterns() > 0 ? 1 : 0;
OptimizedRegularExpression::MatchVec matches;
matches.reserve(capture + 1);
size_t prev_offset = 0;
size_t res_offset = 0;
for (size_t i = 0; i < offsets.size(); ++i)
{
size_t cur_offset = offsets[i];
unsigned count
= regexp->match(reinterpret_cast<const char *>(&data[prev_offset]), cur_offset - prev_offset - 1, matches, capture + 1);
if (count > capture && matches[capture].offset != std::string::npos)
{
const auto & match = matches[capture];
res_data.resize(res_offset + match.length + 1);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], &data[prev_offset + match.offset], match.length);
res_offset += match.length;
}
else
{
res_data.resize(res_offset + 1);
}
res_data[res_offset] = 0;
++res_offset;
res_offsets[i] = res_offset;
prev_offset = cur_offset;
}
}
};
/** Replace all matches of regexp 'needle' to string 'replacement'. 'needle' and 'replacement' are constants.
* 'replacement' could contain substitutions, for example: '\2-\3-\1'
*/
template <bool replace_one = false>
struct ReplaceRegexpImpl
{
/// Sequence of instructions, describing how to get resulting string.
/// Each element is either:
/// - substitution (in that case first element of pair is their number and second element is empty)
/// - string that need to be inserted (in that case, first element of pair is that string and second element is -1)
using Instructions = std::vector<std::pair<int, std::string>>;
static const size_t max_captures = 10;
static Instructions createInstructions(const std::string & s, int num_captures)
{
Instructions instructions;
String now = "";
for (size_t i = 0; i < s.size(); ++i)
{
if (s[i] == '\\' && i + 1 < s.size())
{
if (isNumericASCII(s[i + 1])) /// Substitution
{
if (!now.empty())
{
instructions.emplace_back(-1, now);
now = "";
}
instructions.emplace_back(s[i + 1] - '0', String());
}
else
now += s[i + 1]; /// Escaping
++i;
}
else
now += s[i]; /// Plain character
}
if (!now.empty())
{
instructions.emplace_back(-1, now);
now = "";
}
for (const auto & it : instructions)
if (it.first >= num_captures)
throw Exception(
"Invalid replace instruction in replacement string. Id: " + toString(it.first) + ", but regexp has only "
+ toString(num_captures - 1) + " subpatterns",
ErrorCodes::BAD_ARGUMENTS);
return instructions;
}
static void processString(
const re2_st::StringPiece & input,
ColumnString::Chars & res_data,
ColumnString::Offset & res_offset,
re2_st::RE2 & searcher,
int num_captures,
const Instructions & instructions)
{
re2_st::StringPiece matches[max_captures];
size_t start_pos = 0;
while (start_pos < static_cast<size_t>(input.length()))
{
/// If no more replacements possible for current string
bool can_finish_current_string = false;
if (searcher.Match(input, start_pos, input.length(), re2_st::RE2::Anchor::UNANCHORED, matches, num_captures))
{
const auto & match = matches[0];
size_t bytes_to_copy = (match.data() - input.data()) - start_pos;
/// Copy prefix before matched regexp without modification
res_data.resize(res_data.size() + bytes_to_copy);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, bytes_to_copy);
res_offset += bytes_to_copy;
start_pos += bytes_to_copy + match.length();
/// Do substitution instructions
for (const auto & it : instructions)
{
if (it.first >= 0)
{
res_data.resize(res_data.size() + matches[it.first].length());
memcpy(&res_data[res_offset], matches[it.first].data(), matches[it.first].length());
res_offset += matches[it.first].length();
}
else
{
res_data.resize(res_data.size() + it.second.size());
memcpy(&res_data[res_offset], it.second.data(), it.second.size());
res_offset += it.second.size();
}
}
if (replace_one || match.length() == 0) /// Stop after match of zero length, to avoid infinite loop.
can_finish_current_string = true;
}
else
can_finish_current_string = true;
/// If ready, append suffix after match to end of string.
if (can_finish_current_string)
{
res_data.resize(res_data.size() + input.length() - start_pos);
memcpySmallAllowReadWriteOverflow15(&res_data[res_offset], input.data() + start_pos, input.length() - start_pos);
res_offset += input.length() - start_pos;
start_pos = input.length();
}
}
res_data.resize(res_data.size() + 1);
res_data[res_offset] = 0;
++res_offset;
}
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & needle,
const std::string & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
ColumnString::Offset res_offset = 0;
res_data.reserve(data.size());
size_t size = offsets.size();
res_offsets.resize(size);
re2_st::RE2 searcher(needle);
int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures));
Instructions instructions = createInstructions(replacement, num_captures);
/// Cannot perform search for whole block. Will process each string separately.
for (size_t i = 0; i < size; ++i)
{
int from = i > 0 ? offsets[i - 1] : 0;
re2_st::StringPiece input(reinterpret_cast<const char *>(data.data() + from), offsets[i] - from - 1);
processString(input, res_data, res_offset, searcher, num_captures, instructions);
res_offsets[i] = res_offset;
}
}
static void vector_fixed(
const ColumnString::Chars & data,
size_t n,
const std::string & needle,
const std::string & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
ColumnString::Offset res_offset = 0;
size_t size = data.size() / n;
res_data.reserve(data.size());
res_offsets.resize(size);
re2_st::RE2 searcher(needle);
int num_captures = std::min(searcher.NumberOfCapturingGroups() + 1, static_cast<int>(max_captures));
Instructions instructions = createInstructions(replacement, num_captures);
for (size_t i = 0; i < size; ++i)
{
int from = i * n;
re2_st::StringPiece input(reinterpret_cast<const char *>(data.data() + from), n);
processString(input, res_data, res_offset, searcher, num_captures, instructions);
res_offsets[i] = res_offset;
}
}
};
/** Replace one or all occurencies of substring 'needle' to 'replacement'. 'needle' and 'replacement' are constants.
*/
template <bool replace_one = false>
struct ReplaceStringImpl
{
static void vector(
const ColumnString::Chars & data,
const ColumnString::Offsets & offsets,
const std::string & needle,
const std::string & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
const UInt8 * begin = data.data();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
ColumnString::Offset res_offset = 0;
res_data.reserve(data.size());
size_t size = offsets.size();
res_offsets.resize(size);
/// The current index in the array of strings.
size_t i = 0;
Volnitsky searcher(needle.data(), needle.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end)
{
const UInt8 * match = searcher.search(pos, end - pos);
/// Copy the data without changing
res_data.resize(res_data.size() + (match - pos));
memcpy(&res_data[res_offset], pos, match - pos);
/// Determine which index it belongs to.
while (i < offsets.size() && begin + offsets[i] <= match)
{
res_offsets[i] = res_offset + ((begin + offsets[i]) - pos);
++i;
}
res_offset += (match - pos);
/// If you have reached the end, it's time to stop
if (i == offsets.size())
break;
/// Is it true that this string no longer needs to perform transformations.
bool can_finish_current_string = false;
/// We check that the entry does not go through the boundaries of strings.
if (match + needle.size() < begin + offsets[i])
{
res_data.resize(res_data.size() + replacement.size());
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
res_offset += replacement.size();
pos = match + needle.size();
if (replace_one)
can_finish_current_string = true;
}
else
{
pos = match;
can_finish_current_string = true;
}
if (can_finish_current_string)
{
res_data.resize(res_data.size() + (begin + offsets[i] - pos));
memcpy(&res_data[res_offset], pos, (begin + offsets[i] - pos));
res_offset += (begin + offsets[i] - pos);
res_offsets[i] = res_offset;
pos = begin + offsets[i];
++i;
}
}
}
/// Note: this function converts fixed-length strings to variable-length strings
/// and each variable-length string should ends with zero byte.
static void vector_fixed(
const ColumnString::Chars & data,
size_t n,
const std::string & needle,
const std::string & replacement,
ColumnString::Chars & res_data,
ColumnString::Offsets & res_offsets)
{
const UInt8 * begin = data.data();
const UInt8 * pos = begin;
const UInt8 * end = pos + data.size();
ColumnString::Offset res_offset = 0;
size_t count = data.size() / n;
res_data.reserve(data.size());
res_offsets.resize(count);
/// The current index in the string array.
size_t i = 0;
Volnitsky searcher(needle.data(), needle.size(), end - pos);
/// We will search for the next occurrence in all rows at once.
while (pos < end)
{
const UInt8 * match = searcher.search(pos, end - pos);
#define COPY_REST_OF_CURRENT_STRING() \
do \
{ \
const size_t len = begin + n * (i + 1) - pos; \
res_data.resize(res_data.size() + len + 1); \
memcpy(&res_data[res_offset], pos, len); \
res_offset += len; \
res_data[res_offset++] = 0; \
res_offsets[i] = res_offset; \
pos = begin + n * (i + 1); \
++i; \
} while (false)
/// Copy skipped strings without any changes but
/// add zero byte to the end of each string.
while (i < count && begin + n * (i + 1) <= match)
{
COPY_REST_OF_CURRENT_STRING();
}
/// If you have reached the end, it's time to stop
if (i == count)
break;
/// Copy unchanged part of current string.
res_data.resize(res_data.size() + (match - pos));
memcpy(&res_data[res_offset], pos, match - pos);
res_offset += (match - pos);
/// Is it true that this string no longer needs to perform conversions.
bool can_finish_current_string = false;
/// We check that the entry does not pass through the boundaries of strings.
if (match + needle.size() <= begin + n * (i + 1))
{
res_data.resize(res_data.size() + replacement.size());
memcpy(&res_data[res_offset], replacement.data(), replacement.size());
res_offset += replacement.size();
pos = match + needle.size();
if (replace_one || pos == begin + n * (i + 1))
can_finish_current_string = true;
}
else
{
pos = match;
can_finish_current_string = true;
}
if (can_finish_current_string)
{
COPY_REST_OF_CURRENT_STRING();
}
#undef COPY_REST_OF_CURRENT_STRING
}
}
static void constant(const std::string & data, const std::string & needle, const std::string & replacement, std::string & res_data)
{
res_data = "";
int replace_cnt = 0;
for (size_t i = 0; i < data.size(); ++i)
{
bool match = true;
if (i + needle.size() > data.size() || (replace_one && replace_cnt > 0))
match = false;
for (size_t j = 0; match && j < needle.size(); ++j)
if (data[i + j] != needle[j])
match = false;
if (match)
{
++replace_cnt;
res_data += replacement;
i = i + needle.size() - 1;
}
else
res_data += data[i];
}
}
};
template <typename Impl, typename Name>
class FunctionStringReplace : public IFunction
{
public:
static constexpr auto name = Name::name;
static FunctionPtr create(const Context &) { return std::make_shared<FunctionStringReplace>(); }
String getName() const override { return name; }
size_t getNumberOfArguments() const override { return 3; }
bool useDefaultImplementationForConstants() const override { return true; }
ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {1, 2}; }
DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
{
if (!isStringOrFixedString(arguments[0]))
throw Exception(
"Illegal type " + arguments[0]->getName() + " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isStringOrFixedString(arguments[1]))
throw Exception(
"Illegal type " + arguments[1]->getName() + " of second argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
if (!isStringOrFixedString(arguments[2]))
throw Exception(
"Illegal type " + arguments[2]->getName() + " of third argument of function " + getName(),
ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);
return std::make_shared<DataTypeString>();
}
void executeImpl(Block & block, const ColumnNumbers & arguments, size_t result, size_t /*input_rows_count*/) override
{
const ColumnPtr column_src = block.getByPosition(arguments[0]).column;
const ColumnPtr column_needle = block.getByPosition(arguments[1]).column;
const ColumnPtr column_replacement = block.getByPosition(arguments[2]).column;
if (!column_needle->isColumnConst() || !column_replacement->isColumnConst())
throw Exception("2nd and 3rd arguments of function " + getName() + " must be constants.", ErrorCodes::ILLEGAL_COLUMN);
const IColumn * c1 = block.getByPosition(arguments[1]).column.get();
const IColumn * c2 = block.getByPosition(arguments[2]).column.get();
const ColumnConst * c1_const = typeid_cast<const ColumnConst *>(c1);
const ColumnConst * c2_const = typeid_cast<const ColumnConst *>(c2);
String needle = c1_const->getValue<String>();
String replacement = c2_const->getValue<String>();
if (needle.size() == 0)
throw Exception("Length of the second argument of function replace must be greater than 0.", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
if (const ColumnString * col = checkAndGetColumn<ColumnString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vector(col->getChars(), col->getOffsets(), needle, replacement, col_res->getChars(), col_res->getOffsets());
block.getByPosition(result).column = std::move(col_res);
}
else if (const ColumnFixedString * col_fixed = checkAndGetColumn<ColumnFixedString>(column_src.get()))
{
auto col_res = ColumnString::create();
Impl::vector_fixed(col_fixed->getChars(), col_fixed->getN(), needle, replacement, col_res->getChars(), col_res->getOffsets());
block.getByPosition(result).column = std::move(col_res);
}
else
throw Exception(
"Illegal column " + block.getByPosition(arguments[0]).column->getName() + " of first argument of function " + getName(),
ErrorCodes::ILLEGAL_COLUMN);
}
};
struct NamePosition
{
static constexpr auto name = "position";
@ -1258,46 +445,7 @@ struct NameMultiSearchFirstPositionCaseInsensitiveUTF8
{
static constexpr auto name = "multiSearchFirstPositionCaseInsensitiveUTF8";
};
struct NameMatch
{
static constexpr auto name = "match";
};
struct NameLike
{
static constexpr auto name = "like";
};
struct NameNotLike
{
static constexpr auto name = "notLike";
};
struct NameMultiMatchAny
{
static constexpr auto name = "multiMatchAny";
};
struct NameMultiMatchAnyIndex
{
static constexpr auto name = "multiMatchAnyIndex";
};
struct NameExtract
{
static constexpr auto name = "extract";
};
struct NameReplaceOne
{
static constexpr auto name = "replaceOne";
};
struct NameReplaceAll
{
static constexpr auto name = "replaceAll";
};
struct NameReplaceRegexpOne
{
static constexpr auto name = "replaceRegexpOne";
};
struct NameReplaceRegexpAll
{
static constexpr auto name = "replaceRegexpAll";
};
using FunctionPosition = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveASCII>, NamePosition>;
using FunctionPositionUTF8 = FunctionsStringSearch<PositionImpl<PositionCaseSensitiveUTF8>, NamePositionUTF8>;
@ -1305,12 +453,15 @@ using FunctionPositionCaseInsensitive = FunctionsStringSearch<PositionImpl<Posit
using FunctionPositionCaseInsensitiveUTF8
= FunctionsStringSearch<PositionImpl<PositionCaseInsensitiveUTF8>, NamePositionCaseInsensitiveUTF8>;
using FunctionMultiSearchAllPositions = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
using FunctionMultiSearchAllPositionsUTF8 = FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
using FunctionMultiSearchAllPositions
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveASCII>, NameMultiSearchAllPositions>;
using FunctionMultiSearchAllPositionsUTF8
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAllPositionsUTF8>;
using FunctionMultiSearchAllPositionsCaseInsensitive
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveASCII>, NameMultiSearchAllPositionsCaseInsensitive>;
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8
= FunctionsMultiStringPosition<MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAllPositionsCaseInsensitiveUTF8>;
using FunctionMultiSearchAllPositionsCaseInsensitiveUTF8 = FunctionsMultiStringPosition<
MultiSearchAllPositionsImpl<PositionCaseInsensitiveUTF8>,
NameMultiSearchAllPositionsCaseInsensitiveUTF8>;
using FunctionMultiSearch = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveASCII>, NameMultiSearchAny>;
using FunctionMultiSearchUTF8 = FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseSensitiveUTF8>, NameMultiSearchAnyUTF8>;
@ -1319,39 +470,28 @@ using FunctionMultiSearchCaseInsensitive
using FunctionMultiSearchCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchAnyCaseInsensitiveUTF8>;
using FunctionMultiSearchFirstIndex = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
using FunctionMultiSearchFirstIndexUTF8 = FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
using FunctionMultiSearchFirstIndex
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstIndex>;
using FunctionMultiSearchFirstIndexUTF8
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstIndexUTF8>;
using FunctionMultiSearchFirstIndexCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstIndexCaseInsensitive>;
using FunctionMultiSearchFirstIndexCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchFirstIndexImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstIndexCaseInsensitiveUTF8>;
using FunctionMultiSearchFirstPosition = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
using FunctionMultiSearchFirstPositionUTF8 = FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
using FunctionMultiSearchFirstPosition
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveASCII>, NameMultiSearchFirstPosition>;
using FunctionMultiSearchFirstPositionUTF8
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseSensitiveUTF8>, NameMultiSearchFirstPositionUTF8>;
using FunctionMultiSearchFirstPositionCaseInsensitive
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveASCII>, NameMultiSearchFirstPositionCaseInsensitive>;
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8
= FunctionsMultiStringSearch<MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>, NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
using FunctionMatch = FunctionsStringSearch<MatchImpl<false>, NameMatch>;
using FunctionMultiMatchAny = FunctionsMultiStringSearch<MultiMatchAnyImpl<UInt8, true, false>, NameMultiMatchAny, std::numeric_limits<UInt32>::max()>;
using FunctionMultiMatchAnyIndex = FunctionsMultiStringSearch<MultiMatchAnyImpl<UInt64, false, true>, NameMultiMatchAnyIndex, std::numeric_limits<UInt32>::max()>;
using FunctionLike = FunctionsStringSearch<MatchImpl<true>, NameLike>;
using FunctionNotLike = FunctionsStringSearch<MatchImpl<true, true>, NameNotLike>;
using FunctionExtract = FunctionsStringSearchToString<ExtractImpl, NameExtract>;
using FunctionReplaceOne = FunctionStringReplace<ReplaceStringImpl<true>, NameReplaceOne>;
using FunctionReplaceAll = FunctionStringReplace<ReplaceStringImpl<false>, NameReplaceAll>;
using FunctionReplaceRegexpOne = FunctionStringReplace<ReplaceRegexpImpl<true>, NameReplaceRegexpOne>;
using FunctionReplaceRegexpAll = FunctionStringReplace<ReplaceRegexpImpl<false>, NameReplaceRegexpAll>;
using FunctionMultiSearchFirstPositionCaseInsensitiveUTF8 = FunctionsMultiStringSearch<
MultiSearchFirstPositionImpl<PositionCaseInsensitiveUTF8>,
NameMultiSearchFirstPositionCaseInsensitiveUTF8>;
void registerFunctionsStringSearch(FunctionFactory & factory)
{
factory.registerFunction<FunctionReplaceOne>();
factory.registerFunction<FunctionReplaceAll>();
factory.registerFunction<FunctionReplaceRegexpOne>();
factory.registerFunction<FunctionReplaceRegexpAll>();
factory.registerFunction<FunctionPosition>(FunctionFactory::CaseInsensitive);
factory.registerFunction<FunctionPositionUTF8>();
factory.registerFunction<FunctionPositionCaseInsensitive>();
@ -1377,15 +517,6 @@ void registerFunctionsStringSearch(FunctionFactory & factory)
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitive>();
factory.registerFunction<FunctionMultiSearchFirstPositionCaseInsensitiveUTF8>();
factory.registerFunction<FunctionMatch>();
factory.registerFunction<FunctionLike>();
factory.registerFunction<FunctionNotLike>();
factory.registerFunction<FunctionExtract>();
factory.registerFunction<FunctionMultiMatchAny>();
factory.registerFunction<FunctionMultiMatchAnyIndex>();
factory.registerAlias("locate", NamePosition::name, FunctionFactory::CaseInsensitive);
factory.registerAlias("replace", NameReplaceAll::name, FunctionFactory::CaseInsensitive);
}
}

View File

@ -46,7 +46,7 @@ namespace DB
* multiSearchAllPositionsUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAllPositionsCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchAllPositionsCaseInsensitiveUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
*
* multiSearchFirstPosition(haystack, [pattern_1, pattern_2, ..., pattern_n]) -- returns the first position of the haystack matched by strings or zero if nothing was found
* multiSearchFirstPositionUTF8(haystack, [pattern_1, pattern_2, ..., pattern_n])
* multiSearchFirstPositionCaseInsensitive(haystack, [pattern_1, pattern_2, ..., pattern_n])
@ -291,7 +291,8 @@ public:
static FunctionPtr create(const Context & context)
{
if (Impl::is_using_hyperscan && !context.getSettingsRef().allow_hyperscan)
throw Exception("Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED);
throw Exception(
"Hyperscan functions are disabled, because setting 'allow_hyperscan' is set to 0", ErrorCodes::FUNCTION_NOT_ALLOWED);
return std::make_shared<FunctionsMultiStringSearch>();
}
@ -355,7 +356,6 @@ public:
vec_res.resize(column_haystack_size);
/// TODO support constant_constant version
if (col_haystack_vector)
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), refs, vec_res);
else

View File

@ -318,32 +318,32 @@ struct NgramDistanceImpl
};
struct NgramDistanceName
struct NameNgramDistance
{
static constexpr auto name = "ngramDistance";
};
struct NgramDistanceCaseInsensitiveName
struct NameNgramDistanceCaseInsensitive
{
static constexpr auto name = "ngramDistanceCaseInsensitive";
};
struct NgramDistanceUTF8Name
struct NameNgramDistanceUTF8
{
static constexpr auto name = "ngramDistanceUTF8";
};
struct NgramDistanceUTF8CaseInsensitiveName
struct NameNgramDistanceUTF8CaseInsensitive
{
static constexpr auto name = "ngramDistanceCaseInsensitiveUTF8";
};
using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false>, NgramDistanceName>;
using FunctionNgramDistance = FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, false>, NameNgramDistance>;
using FunctionNgramDistanceCaseInsensitive
= FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true>, NgramDistanceCaseInsensitiveName>;
using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false>, NgramDistanceUTF8Name>;
= FunctionsStringSimilarity<NgramDistanceImpl<4, UInt8, false, true>, NameNgramDistanceCaseInsensitive>;
using FunctionNgramDistanceUTF8 = FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, false>, NameNgramDistanceUTF8>;
using FunctionNgramDistanceCaseInsensitiveUTF8
= FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true>, NgramDistanceUTF8CaseInsensitiveName>;
= FunctionsStringSimilarity<NgramDistanceImpl<3, UInt32, true, true>, NameNgramDistanceUTF8CaseInsensitive>;
void registerFunctionsStringSimilarity(FunctionFactory & factory)
{

View File

@ -9,12 +9,14 @@
namespace DB
{
/** Calculate similarity metrics:
*
* ngramDistance(haystack, needle) --- calculate n-gram distance between haystack and needle.
* ngramDistance(haystack, needle) - calculate n-gram distance between haystack and needle.
* Returns float number from 0 to 1 - the closer to zero, the more strings are similar to each other.
* Also support CaseInsensitive and UTF8 formats.
* ngramDistanceCaseInsensitive(haystack, needle)
* ngramDistanceUTF8(haystack, needle)
* ngramDistanceCaseInsensitiveUTF8(haystack, needle)
*/
namespace ErrorCodes
@ -70,11 +72,13 @@ public:
if (needle.size() > Impl::max_string_size)
{
throw Exception(
"String size of needle is too big for function " + getName() + ". Should be at most " + std::to_string(Impl::max_string_size),
ErrorCodes::TOO_LARGE_STRING_SIZE);
"String size of needle is too big for function " + getName() + ". Should be at most "
+ std::to_string(Impl::max_string_size),
ErrorCodes::TOO_LARGE_STRING_SIZE);
}
Impl::constant_constant(col_haystack_const->getValue<String>(), needle, res);
block.getByPosition(result).column = block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res));
block.getByPosition(result).column
= block.getByPosition(result).type->createColumnConst(col_haystack_const->size(), toField(res));
return;
}
@ -91,11 +95,11 @@ public:
if (needle.size() > Impl::max_string_size)
{
throw Exception(
"String size of needle is too big for function " + getName() + ". Should be at most " + std::to_string(Impl::max_string_size),
ErrorCodes::TOO_LARGE_STRING_SIZE);
"String size of needle is too big for function " + getName() + ". Should be at most "
+ std::to_string(Impl::max_string_size),
ErrorCodes::TOO_LARGE_STRING_SIZE);
}
Impl::vector_constant(
col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needle, vec_res);
Impl::vector_constant(col_haystack_vector->getChars(), col_haystack_vector->getOffsets(), needle, vec_res);
}
else
{

View File

@ -1,21 +1,23 @@
#pragma once
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
#include <Functions/likePatternToRegexp.h>
#include <Common/ObjectPool.h>
#include <Common/OptimizedRegularExpression.h>
#include <Common/ProfileEvents.h>
#include <common/StringRef.h>
#include <memory>
#include <string>
#include <vector>
#include <Common/config.h>
#if USE_HYPERSCAN
# if __has_include(<hs/hs.h>)
# include <hs/hs.h>
# else
# include <hs.h>
# endif
# if __has_include(<hs/hs.h>)
# include <hs/hs.h>
# else
# include <hs.h>
# endif
#endif
namespace ProfileEvents
@ -81,12 +83,15 @@ namespace MultiRegexps
}
};
using CompilerError = std::unique_ptr<hs_compile_error_t, HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>>;
using ScratchPtr = std::unique_ptr<hs_scratch_t, DB::MultiRegexps::HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>>;
using Regexps = std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>>;
using Pool = ObjectPoolMap<Regexps, std::vector<String>>;
using Pool = ObjectPoolMap<Regexps, std::pair<std::vector<String>, std::optional<UInt32>>>;
template <bool FindAnyIndex>
inline Pool::Pointer get(const std::vector<StringRef> & patterns)
/// If CompileForEditDistance is False, edit_distance must be nullopt
template <bool FindAnyIndex, bool CompileForEditDistance>
inline Pool::Pointer get(const std::vector<StringRef> & patterns, std::optional<UInt32> edit_distance)
{
/// C++11 has thread-safe function-local statics on most modern compilers.
static Pool known_regexps; /// Different variables for different pattern parameters.
@ -96,16 +101,36 @@ namespace MultiRegexps
for (const StringRef & ref : patterns)
str_patterns.push_back(ref.toString());
return known_regexps.get(str_patterns, [&str_patterns]
return known_regexps.get({str_patterns, edit_distance}, [&str_patterns, edit_distance]
{
/// Common pointers
std::vector<const char *> ptrns;
std::vector<unsigned int> flags;
/// Pointer for external edit distance compilation
std::vector<hs_expr_ext> ext_exprs;
std::vector<const hs_expr_ext *> ext_exprs_ptrs;
ptrns.reserve(str_patterns.size());
flags.reserve(str_patterns.size());
if constexpr (CompileForEditDistance)
{
ext_exprs.reserve(str_patterns.size());
ext_exprs_ptrs.reserve(str_patterns.size());
}
for (const StringRef ref : str_patterns)
{
ptrns.push_back(ref.data);
flags.push_back(HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_SINGLEMATCH);
if constexpr (CompileForEditDistance)
{
ext_exprs.emplace_back();
ext_exprs.back().flags = HS_EXT_FLAG_EDIT_DISTANCE;
ext_exprs.back().edit_distance = edit_distance.value();
ext_exprs_ptrs.push_back(&ext_exprs.back());
}
}
hs_database_t * db = nullptr;
hs_compile_error_t * compile_error;
@ -120,13 +145,32 @@ namespace MultiRegexps
ids[i] = i + 1;
}
hs_error_t err
= hs_compile_multi(ptrns.data(), flags.data(), ids.get(), ptrns.size(), HS_MODE_BLOCK, nullptr, &db, &compile_error);
hs_error_t err;
if constexpr (!CompileForEditDistance)
err = hs_compile_multi(
ptrns.data(),
flags.data(),
ids.get(),
ptrns.size(),
HS_MODE_BLOCK,
nullptr,
&db,
&compile_error);
else
err = hs_compile_ext_multi(
ptrns.data(),
flags.data(),
ids.get(),
ext_exprs_ptrs.data(),
ptrns.size(),
HS_MODE_BLOCK,
nullptr,
&db,
&compile_error);
if (err != HS_SUCCESS)
{
std::unique_ptr<
hs_compile_error_t,
HyperscanDeleter<decltype(&hs_free_compile_error), &hs_free_compile_error>> error(compile_error);
CompilerError error(compile_error);
if (error->expression < 0)
throw Exception(String(error->message), ErrorCodes::LOGICAL_ERROR);

View File

@ -1,12 +1,10 @@
#include <Common/config.h>
#include <Functions/registerFunctions.h>
#include <Functions/FunctionFactory.h>
#include <Functions/registerFunctions.h>
#include <Common/config.h>
namespace DB
{
/** These functions are defined in a separate translation units.
* This is done in order to reduce the consumption of RAM during build, and to speed up the parallel build.
*/
@ -34,6 +32,7 @@ void registerFunctionsRound(FunctionFactory &);
void registerFunctionsString(FunctionFactory &);
void registerFunctionsStringArray(FunctionFactory &);
void registerFunctionsStringSearch(FunctionFactory &);
void registerFunctionsStringRegex(FunctionFactory &);
void registerFunctionsStringSimilarity(FunctionFactory &);
void registerFunctionsURL(FunctionFactory &);
void registerFunctionsVisitParam(FunctionFactory &);
@ -75,6 +74,7 @@ void registerFunctions()
registerFunctionsString(factory);
registerFunctionsStringArray(factory);
registerFunctionsStringSearch(factory);
registerFunctionsStringRegex(factory);
registerFunctionsStringSimilarity(factory);
registerFunctionsURL(factory);
registerFunctionsVisitParam(factory);

View File

@ -0,0 +1,32 @@
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1

View File

@ -0,0 +1,26 @@
SET send_logs_level = 'none';
select 0 = multiMatchAnyEditDistance('abc', 0, ['a1c']) from system.numbers limit 5;
select 1 = multiMatchAnyEditDistance('abc', 1, ['a1c']) from system.numbers limit 5;
select 1 = multiMatchAnyEditDistance('abc', 2, ['a1c']) from system.numbers limit 5;
select 1 = multiMatchAnyEditDistance('abc', 3, ['a1c']) from system.numbers limit 5; -- { serverError 49 }
select 1 = multiMatchAnyEditDistance('abc', 4, ['a1c']) from system.numbers limit 5; -- { serverError 49 }
select 1 = multiMatchAnyEditDistance('leftabcright', 1, ['a1c']) from system.numbers limit 5;
select 1 = multiMatchAnyEditDistance('hello some world', 0, ['^hello.*world$']);
select 1 = multiMatchAnyEditDistance('hallo some world', 1, ['^hello.*world$']);
select 0 = multiMatchAnyEditDistance('halo some wrld', 2, ['^hello.*world$']);
select 1 = multiMatchAnyEditDistance('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
select 1 = multiMatchAnyEditDistance('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);
select 1 = multiMatchAnyEditDistance('halo some wrld', 3, ['^hello.*world$']);
select 1 = multiMatchAnyEditDistance('hello some world', 10, ['^hello.*world$']); -- { serverError 49 }
select 1 = multiMatchAnyEditDistance('hello some world', -1, ['^hello.*world$']); -- { serverError 43 }
select 1 = multiMatchAnyEditDistance('hello some world', 10000000000, ['^hello.*world$']); -- { serverError 44 }
select 1 = multiMatchAnyEditDistance('http://hyperscan_is_nice.ru/st', 2, ['http://hyperscan_is_nice.ru/(st\\d\\d$|st\\d\\d\\.|st1[0-4]\\d|st150|st\\d$|gl|rz|ch)']);
select 0 = multiMatchAnyEditDistance('string', 0, ['zorro$', '^tring', 'in$', 'how.*', 'it{2}', 'works']);
select 1 = multiMatchAnyEditDistance('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
select 2 = multiMatchAnyIndexEditDistance('string', 1, ['zorro$', '^tring', 'ip$', 'how.*', 'it{2}', 'works']);
select 2 = multiMatchAnyIndexEditDistance('halo some wrld', 2, ['^hello.*world$', '^halo.*world$']);
select 1 = multiMatchAnyIndexEditDistance('halo some wrld', 2, ['^halo.*world$', '^hello.*world$']);

View File

@ -39,7 +39,7 @@ Returns 1, if at least one string `needle_i` matches the string `haystack` and 0
For a case-insensitive search or/and in UTF-8 format use functions `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`.
Note: in all `multiSearch*` functions the number of needles should be less than 2<sup>8</sup> because of implementation specification.
**Note: in all `multiSearch*` functions the number of needles should be less than 2<sup>8</sup> because of implementation specification.**
## match(haystack, pattern)
@ -56,12 +56,22 @@ For patterns to search for substrings in a string, it is better to use LIKE or '
The same as `match`, but returns 0 if none of the regular expressions are matched and 1 if any of the patterns matches. It uses [hyperscan](https://github.com/intel/hyperscan) library. For patterns to search substrings in a string, it is better to use `multiSearchAny` since it works much faster.
Note: this function is in experimental mode because of some [issues](https://github.com/intel/hyperscan/issues/141). The length of any of the `haystack` string must be less than 2<sup>32</sup> bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API.
**Note: this function is in experimental mode because of some [issues](https://github.com/intel/hyperscan/issues/141). The length of any of the `haystack` string must be less than 2<sup>32</sup> bytes otherwise the exception is thrown. This restriction takes place because of hyperscan API.**
## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n])
The same as `multiMatchAny`, but returns any index that matches the haystack.
## multiMatchAnyEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
The same as `multiMatchAny`, but returns 1 if any pattern matches the haystack within constant [edit distance](https://en.wikipedia.org/wiki/Edit_distance). This function is also in an experimental mode and can be extremely slow. For more information see [hyperscan documentation](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
## multiMatchAnyIndexEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
The same as `multiMatchAnyEditDistance`, but returns any index that matches the haystack within constant edit distance.
**Note: to turn off all functions that use hyperscan, use setting `SET allow_hyperscan = 0;`.**
## extract(haystack, pattern)
Extracts a fragment of a string using a regular expression. If 'haystack' doesn't match the 'pattern' regex, an empty string is returned. If the regex doesn't contain subpatterns, it takes the fragment that matches the entire regex. Otherwise, it takes the fragment that matches the first subpattern.
@ -94,7 +104,7 @@ Calculates the 4-gram distance between `haystack` and `needle`: counts the symme
For case-insensitive search or/and in UTF-8 format use functions `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
Notes: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.
**Note: For UTF-8 case we use 3-gram distance. All these are not perfectly fair n-gram distances. We use 2-byte hashes to hash n-grams and then calculate the symmetric difference between these hash tables -- collisions may occur. With UTF-8 case-insensitive format we do not use fair `tolower` function -- we zero the 5-th bit (starting from zero) of each codepoint byte -- this works for Latin and mostly for all Cyrillic letters.**
[Original article](https://clickhouse.yandex/docs/en/query_language/functions/string_search_functions/) <!--hide-->

View File

@ -34,7 +34,7 @@
Для поиска без учета регистра и/или в кодировке UTF-8 используйте функции `multiSearchAnyCaseInsensitive, multiSearchAnyUTF8, multiSearchAnyCaseInsensitiveUTF8`.
Примечание: во всех функциях `multiSearch*` количество needles должно быть меньше 2<sup>8</sup> из-за внутренностей реализации.
**Примечание: во всех функциях `multiSearch*` количество needles должно быть меньше 2<sup>8</sup> из-за внутренностей реализации.**
## match(haystack, pattern)
Проверка строки на соответствие регулярному выражению pattern. Регулярное выражение **re2**. Синтаксис регулярных выражений **re2** является более ограниченным по сравнению с регулярными выражениями **Perl** ([подробнее](https://github.com/google/re2/wiki/Syntax)).
@ -49,11 +49,21 @@
То же, что и `match`, но возвращает ноль, если ни одно регулярное выражение не подошло и один, если хотя бы одно. Используется библиотека [hyperscan](https://github.com/intel/hyperscan) для соответствия регулярных выражений. Для шаблонов на поиск многих подстрок в строке, лучше используйте `multiSearchAny`, так как она работает существенно быстрее.
Примечание: эта функция находится пока в экспериментальном режиме из-за некоторых [проблем](https://github.com/intel/hyperscan/issues/141). Длина любой строки из `haystack` должна быть меньше 2<sup>32</sup> байт, иначе бросается исключение. Это ограничение связано с ограничением hyperscan API.
**Примечание: эта функция находится пока в экспериментальном режиме из-за некоторых [проблем](https://github.com/intel/hyperscan/issues/141). Длина любой строки из `haystack` должна быть меньше 2<sup>32</sup> байт, иначе бросается исключение. Это ограничение связано с ограничением hyperscan API.**
## multiMatchAnyIndex(haystack, [pattern_1, pattern_2, ..., pattern_n])
То же, что `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения.
То же, что и `multiMatchAny`, только возвращает любой индекс подходящего регулярного выражения.
## multiMatchAnyEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
То же, что и `multiMatchAny`, но возвращает 1 если любой pattern соответствует haystack в пределах константного [редакционного расстояния](https://en.wikipedia.org/wiki/Edit_distance). Эта функция также находится в экспериментальном режиме и может быть очень медленной. За подробностями обращайтесь к [документации hyperscan](https://intel.github.io/hyperscan/dev-reference/compilation.html#approximate-matching).
## multiMatchAnyIndexEditDistance(haystack, distance, [pattern_1, pattern_2, ..., pattern_n])
То же, что и `multiMatchAnyEditDistance`, только возвращает любой индекс подходящего регулярного выражения в пределах константного редакционного расстояния.
**Примечание: чтобы выключить все функции, использующие hyperscan, используйте настройку `SET allow_hyperscan = 0;`.**
## extract(haystack, pattern)
Извлечение фрагмента строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Если регулярное выражение не содержит subpattern-ов, то вынимается фрагмент, который подпадает под всё регулярное выражение. Иначе вынимается фрагмент, который подпадает под первый subpattern.
@ -83,6 +93,6 @@
Для поиска без учета регистра и/или в формате UTF-8 используйте функции `ngramDistanceCaseInsensitive, ngramDistanceUTF8, ngramDistanceCaseInsensitiveUTF8`.
Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами -- могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` -- мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки -- это работает для латиницы и почти для всех кириллических букв.
**Примечание: для случая UTF-8 мы используем триграммное расстояние. Вычисление n-граммного расстояния не совсем честное. Мы используем 2-х байтные хэши для хэширования n-грамм, а затем вычисляем симметричную разность между хэш таблицами -- могут возникнуть коллизии. В формате UTF-8 без учета регистра мы не используем честную функцию `tolower` -- мы обнуляем 5-й бит (нумерация с нуля) каждого байта кодовой точки -- это работает для латиницы и почти для всех кириллических букв.**
[Оригинальная статья](https://clickhouse.yandex/docs/ru/query_language/functions/string_search_functions/) <!--hide-->