Merge pull request #47218 from hanfei1991/hanfei/optimize-regexp-tree-1

Refine OptimizeRegularExpression Function and RegexpTreeDict
This commit is contained in:
Han Fei 2023-03-27 15:23:01 +02:00 committed by GitHub
commit e3afa5090f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 400 additions and 126 deletions

View File

@ -1,3 +1,4 @@
#include <limits>
#include <Common/Exception.h>
#include <Common/PODArray.h>
#include <Common/OptimizedRegularExpression.h>
@ -14,13 +15,40 @@ namespace DB
}
}
namespace
{
template <bool thread_safe>
void OptimizedRegularExpressionImpl<thread_safe>::analyze(
/// A literal string extracted from a regular expression, plus two flags describing
/// where it sits relative to the expression it was extracted from.
struct Literal
{
    std::string literal;
    /// Both flags carry default initializers so that a default-constructed Literal
    /// never exposes indeterminate values, even if it is read before clear() is called.
    bool prefix = false; /// this literal string is the prefix of the whole string.
    bool suffix = false; /// this literal string is the suffix of the whole string.

    /// Reset to the empty state: no text, neither prefix nor suffix.
    void clear()
    {
        literal.clear();
        prefix = false;
        suffix = false;
    }
};
using Literals = std::vector<Literal>;
/// Returns the length of the shortest literal in `literals`, or 0 when the list is empty.
size_t shortest_literal_length(const Literals & literals)
{
    /// An empty list yields 0; otherwise start from the largest value and shrink.
    size_t result = literals.empty() ? 0 : std::numeric_limits<size_t>::max();
    for (const auto & item : literals)
    {
        const size_t current_length = item.literal.size();
        if (current_length < result)
            result = current_length;
    }
    return result;
}
const char * analyzeImpl(
std::string_view regexp,
std::string & required_substring,
const char * pos,
Literal & required_substring,
bool & is_trivial,
bool & required_substring_is_prefix)
Literals & global_alternatives)
{
/** The expression is trivial if all the metacharacters in it are escaped.
* The non-alternative string is
@ -30,12 +58,11 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
* and also avoid substrings of the form `http://` or `www` and some other
* (this is the hack for typical use case in web analytics applications).
*/
const char * begin = regexp.data();
const char * pos = begin;
const char * begin = pos;
const char * end = regexp.data() + regexp.size();
bool is_first_call = begin == regexp.data();
int depth = 0;
is_trivial = true;
required_substring_is_prefix = false;
required_substring.clear();
bool has_alternative_on_depth_0 = false;
bool has_case_insensitive_flag = false;
@ -47,6 +74,80 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
Substrings trivial_substrings(1);
Substring * last_substring = &trivial_substrings.back();
Literals cur_alternatives;
auto finish_cur_alternatives = [&]()
{
if (cur_alternatives.empty())
return;
if (global_alternatives.empty())
{
global_alternatives = cur_alternatives;
cur_alternatives.clear();
return;
}
/// that means current alternatives have better quality.
if (shortest_literal_length(global_alternatives) < shortest_literal_length(cur_alternatives))
{
global_alternatives.clear();
global_alternatives = cur_alternatives;
}
cur_alternatives.clear();
};
auto finish_non_trivial_char = [&](bool create_new_substr = true)
{
if (depth != 0)
return;
for (auto & alter : cur_alternatives)
{
if (alter.suffix)
{
alter.literal += last_substring->first;
}
}
finish_cur_alternatives();
if (!last_substring->first.empty() && create_new_substr)
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
};
/// Resolve the string or alters in a group (xxxxx)
auto finish_group = [&](Literal & group_required_string, Literals & group_alternatives)
{
for (auto & alter : group_alternatives)
{
if (alter.prefix)
{
alter.literal = last_substring->first + alter.literal;
}
}
if (group_required_string.prefix)
last_substring->first += group_required_string.literal;
else
{
finish_non_trivial_char();
last_substring->first = group_required_string.literal;
}
/// if we can still append, no need to finish it. e.g. abc(de)fg should capture abcdefg
if (!last_substring->first.empty() && !group_required_string.suffix)
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
/// assign group alters to current alters.
finish_cur_alternatives();
cur_alternatives = std::move(group_alternatives);
};
bool in_curly_braces = false;
bool in_square_braces = false;
@ -73,25 +174,19 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
case '$':
case '.':
case '[':
case ']':
case '?':
case '*':
case '+':
case '-':
case '{':
if (depth == 0 && !in_curly_braces && !in_square_braces)
{
if (last_substring->first.empty())
last_substring->second = pos - begin;
last_substring->first.push_back(*pos);
}
break;
case '}':
case '/':
goto ordinary;
default:
/// all other escape sequences are not supported
is_trivial = false;
if (!last_substring->first.empty())
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
finish_non_trivial_char();
break;
}
@ -100,28 +195,19 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
}
case '|':
if (depth == 0)
has_alternative_on_depth_0 = true;
is_trivial = false;
if (!in_square_braces && !last_substring->first.empty())
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
++pos;
if (depth == 0)
{
has_alternative_on_depth_0 = true;
goto finish;
}
break;
case '(':
is_trivial = false;
if (!in_square_braces)
{
++depth;
is_trivial = false;
if (!last_substring->first.empty())
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
/// Check for case-insensitive flag.
if (pos + 1 < end && pos[1] == '?')
{
@ -143,6 +229,28 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
break;
}
}
if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
{
pos += 2;
}
Literal group_required_substr;
bool group_is_trival = true;
Literals group_alters;
pos = analyzeImpl(regexp, pos + 1, group_required_substr, group_is_trival, group_alters);
/// pos should be ')', if not, then it is not a valid regular expression
if (pos == end)
return pos;
/// For ()? or ()* or (){0,1}, we can just ignore the whole group.
if ((pos + 1 < end && (pos[1] == '?' || pos[1] == '*')) ||
(pos + 2 < end && pos[1] == '{' && pos[2] == '0'))
{
finish_non_trivial_char();
}
else
{
finish_group(group_required_substr, group_alters);
}
}
++pos;
break;
@ -151,11 +259,7 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
in_square_braces = true;
++depth;
is_trivial = false;
if (!last_substring->first.empty())
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
finish_non_trivial_char();
++pos;
break;
@ -163,38 +267,25 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
if (!in_square_braces)
goto ordinary;
in_square_braces = false;
--depth;
if (depth == 0)
in_square_braces = false;
is_trivial = false;
if (!last_substring->first.empty())
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
finish_non_trivial_char();
++pos;
break;
case ')':
if (!in_square_braces)
{
--depth;
is_trivial = false;
if (!last_substring->first.empty())
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
goto finish;
}
++pos;
break;
case '^': case '$': case '.': case '+':
is_trivial = false;
if (!last_substring->first.empty() && !in_square_braces)
{
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
finish_non_trivial_char();
++pos;
break;
@ -206,12 +297,11 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
[[fallthrough]];
case '*':
is_trivial = false;
if (!last_substring->first.empty() && !in_square_braces)
if (depth == 0 && !last_substring->first.empty() && !in_square_braces)
{
last_substring->first.resize(last_substring->first.size() - 1);
trivial_substrings.resize(trivial_substrings.size() + 1);
last_substring = &trivial_substrings.back();
}
finish_non_trivial_char();
++pos;
break;
@ -236,13 +326,15 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
break;
}
}
finish:
if (last_substring && last_substring->first.empty())
trivial_substrings.pop_back();
finish_non_trivial_char(false);
if (!is_trivial)
{
if (!has_alternative_on_depth_0 && !has_case_insensitive_flag)
/// we calculate required substring even though has_alternative_on_depth_0.
/// we will clear the required substring after putting it to alternatives.
if (!has_case_insensitive_flag)
{
/// We choose the non-alternative substring of the maximum length for first search.
@ -262,19 +354,45 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
}
}
if (max_length >= MIN_LENGTH_FOR_STRSTR)
if (max_length >= MIN_LENGTH_FOR_STRSTR || (!is_first_call && max_length > 0))
{
required_substring = candidate_it->first;
required_substring_is_prefix = candidate_it->second == 0;
required_substring.literal = candidate_it->first;
required_substring.prefix = candidate_it->second == 0;
required_substring.suffix = candidate_it + 1 == trivial_substrings.end();
}
}
}
else if (!trivial_substrings.empty())
{
required_substring = trivial_substrings.front().first;
required_substring_is_prefix = trivial_substrings.front().second == 0;
required_substring.literal = trivial_substrings.front().first;
required_substring.prefix = trivial_substrings.front().second == 0;
required_substring.suffix = true;
}
/// if it is xxx|xxx|xxx, we should call the next xxx|xxx recursively and collect the result.
if (has_alternative_on_depth_0)
{
/// compare the quality of required substring and alternatives and choose the better one.
if (shortest_literal_length(global_alternatives) < required_substring.literal.size())
global_alternatives = {required_substring};
Literals next_alternatives;
/// These two values are useless: xxx|xxx can be neither trivial nor a prefix.
bool next_is_trivial = true;
pos = analyzeImpl(regexp, pos, required_substring, next_is_trivial, next_alternatives);
/// For xxx|xxx|xxx, we only combine the alternatives and return an empty required_substring.
if (next_alternatives.empty() || shortest_literal_length(next_alternatives) < required_substring.literal.size())
{
global_alternatives.push_back(required_substring);
}
else
{
global_alternatives.insert(global_alternatives.end(), next_alternatives.begin(), next_alternatives.end());
}
required_substring.clear();
}
return pos;
/* std::cerr
<< "regexp: " << regexp
<< ", is_trivial: " << is_trivial
@ -282,12 +400,31 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
<< ", required_substring_is_prefix: " << required_substring_is_prefix
<< std::endl;*/
}
}
/// Public entry point: runs analyzeImpl over the whole regexp and converts its
/// Literal-based results into plain strings for the caller.
/// The extracted alternatives are appended to `alternatives`.
template <bool thread_safe>
void OptimizedRegularExpressionImpl<thread_safe>::analyze(
    std::string_view regexp_,
    std::string & required_substring,
    bool & is_trivial,
    bool & required_substring_is_prefix,
    std::vector<std::string> & alternatives)
{
    Literal main_literal;
    Literals found_alternatives;

    /// Start the analysis at the first character of the expression.
    analyzeImpl(regexp_, regexp_.data(), main_literal, is_trivial, found_alternatives);

    required_substring_is_prefix = main_literal.prefix;
    required_substring = std::move(main_literal.literal);

    for (auto & candidate : found_alternatives)
        alternatives.push_back(std::move(candidate.literal));
}
template <bool thread_safe>
OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
{
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
std::vector<std::string> alternativesDummy; /// this vector extracts patterns a,b,c from pattern (a|b|c). for now it's not used.
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix, alternativesDummy);
/// Just three following options are supported
if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))

View File

@ -95,6 +95,15 @@ public:
out_required_substring_is_prefix = required_substring_is_prefix;
}
/// analyze function will extract the longest string literal or multiple alternative string literals from regexp for pre-checking if
/// a string contains the string literal(s). If not, we can tell this string can never match the regexp.
static void analyze(
std::string_view regexp_,
std::string & required_substring,
bool & is_trivial,
bool & required_substring_is_prefix,
std::vector<std::string> & alternatives);
private:
bool is_trivial;
bool required_substring_is_prefix;
@ -104,8 +113,6 @@ private:
std::optional<DB::ASCIICaseInsensitiveStringSearcher> case_insensitive_substring_searcher;
std::unique_ptr<RegexType> re2;
unsigned number_of_subpatterns;
static void analyze(std::string_view regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix);
};
using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>;

View File

@ -0,0 +1,46 @@
#include <gtest/gtest.h>
#include <Common/OptimizedRegularExpression.h>
/// Checks OptimizedRegularExpression::analyze against a table of regular expressions,
/// comparing the extracted required substring, the extracted alternatives and the
/// is_trivial flag with the expected values.
TEST(OptimizeRE, analyze)
{
/// Helper: analyzes `regexp` and expects `answer` as the required substring,
/// `expect_alternatives` as the alternatives (none by default) and
/// `trival_expected` as is_trivial (false by default).
/// The is_prefix output is received but not checked.
auto test_f = [](const std::string & regexp, const std::string & answer, std::vector<std::string> expect_alternatives = {}, bool trival_expected = false)
{
std::string required;
bool is_trivial;
bool is_prefix;
std::vector<std::string> alternatives;
OptimizedRegularExpression::analyze(regexp, required, is_trivial, is_prefix, alternatives);
/// Print the expression under test so a failing expectation can be attributed to it.
std::cerr << regexp << std::endl;
EXPECT_EQ(required, answer);
EXPECT_EQ(alternatives, expect_alternatives);
EXPECT_EQ(is_trivial, trival_expected);
};
/// The only case expected to be trivial: no metacharacters at all.
test_f("abc", "abc", {}, true);
test_f("c([^k]*)de", "");
/// A plain group is expected to be merged into the surrounding literal.
test_f("abc(de)fg", "abcdefg");
/// A group with alternation is expected to produce one alternative per branch.
test_f("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"});
test_f("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"});
/// Top-level alternation: no required substring, only alternatives.
test_f("abc|fgk|xyz", "", {"abc","fgk", "xyz"});
test_f("(abc)", "abc");
test_f("(abc|fgk)", "", {"abc","fgk"});
test_f("(abc|fgk)(e|f|zkh|)", "", {"abc","fgk"});
test_f("abc(abc|fg)xyzz", "xyzz", {"abcabcxyzz","abcfgxyzz"});
test_f("abc[k]xyzz", "xyzz");
test_f("(abc[k]xyzz)", "xyzz");
/// Nested groups, capturing and non-capturing.
test_f("abc((de)fg(hi))jk", "abcdefghijk");
test_f("abc((?:de)fg(?:hi))jk", "abcdefghijk");
test_f("abc((de)fghi+zzz)jk", "abcdefghi");
test_f("abc((de)fg(hi))?jk", "abc");
test_f("abc((de)fghi?zzz)jk", "abcdefgh");
test_f("abc(*cd)jk", "cdjk");
test_f(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"});
/// A group followed by `?` or `{0,1}` is expected to contribute no alternatives.
test_f("abc(abc|fg)?xyzz", "xyzz");
test_f("abc(abc|fg){0,1}xyzz", "xyzz");
/// Top-level alternation combined with groups inside the branches.
test_f("abc(abc|fg)xyzz|bcdd?k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bc"});
test_f("abc(abc|fg)xyzz|bc(dd?x|kk?y|(f))k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bck", "bcfk", "bc"});
test_f("((?:abc|efg|xyz)/[a-zA-Z0-9]{1-50})(/?[^ ]*|)", "", {"abc/", "efg/", "xyz/"});
test_f(R"([Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Daumoa(?:-feedfetcher|)|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|))", "", {"pider-", "bingbot", "Yeti-", "Yeti", "Catchpoint bot", "Catchpoint", "harlotte", "Daumoa-feedfetcher", "Daumoa", "-Googlebot", "Googlebot"});
/// NOTE(review): `(:?` below may have been meant as the non-capturing `(?:` - confirm.
/// The expectations are for the patterns exactly as written.
test_f("abc|(:?xx|yy|zz|x?)def", "", {"abc", "def"});
test_f("abc|(:?xx|yy|zz|x?){1,2}def", "", {"abc", "def"});
}

View File

@ -934,7 +934,7 @@ class IColumn;
M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format BSON.", 0) \
\
M(Bool, regexp_dict_allow_other_sources, false, "Allow regexp_tree dictionary to use sources other than yaml source.", 0) \
M(Bool, regexp_dict_allow_hyperscan, false, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
// End of FORMAT_FACTORY_SETTINGS
// Please add settings non-related to formats into the COMMON_SETTINGS above.

View File

@ -9,9 +9,10 @@
#include <Poco/Logger.h>
#include <Poco/RegularExpression.h>
#include "Common/Exception.h"
#include <Common/ArenaUtils.h>
#include <Common/Exception.h>
#include <Common/logger_useful.h>
#include <Common/OptimizedRegularExpression.h>
#include <Core/ColumnsWithTypeAndName.h>
#include <DataTypes/DataTypeString.h>
#include <DataTypes/DataTypesNumber.h>
@ -34,6 +35,7 @@
#if USE_VECTORSCAN
# include <hs.h>
# include <hs_compile.h>
#endif
namespace DB
@ -46,6 +48,7 @@ namespace ErrorCodes
extern const int HYPERSCAN_CANNOT_SCAN_TEXT;
extern const int UNSUPPORTED_METHOD;
extern const int INCORRECT_DICTIONARY_DEFINITION;
extern const int LOGICAL_ERROR;
}
const std::string kRegExp = "regexp";
@ -172,10 +175,6 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
auto keys_column = block.getByName(kKeys).column;
auto values_column = block.getByName(kValues).column;
#ifdef USE_VECTORSCAN
SlowWithHyperscanChecker checker;
#endif
size_t size = block.rows();
for (size_t i = 0; i < size; i++)
{
@ -219,12 +218,36 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
}
}
regex_nodes.emplace(id, node);
#if USE_VECTORSCAN
if (use_vectorscan && !checker.isSlow(regex))
String required_substring;
bool is_trivial, required_substring_is_prefix;
std::vector<std::string> alternatives;
if (use_vectorscan)
OptimizedRegularExpression::analyze(regex, required_substring, is_trivial, required_substring_is_prefix, alternatives);
for (auto & alter : alternatives)
{
simple_regexps.push_back(regex);
if (alter.size() < 3)
{
alternatives.clear();
break;
}
}
if (!required_substring.empty())
{
simple_regexps.push_back(required_substring);
regexp_ids.push_back(id);
}
else if (!alternatives.empty())
{
for (auto & alternative : alternatives)
{
simple_regexps.push_back(alternative);
regexp_ids.push_back(id);
}
}
else
#endif
complex_regexp_nodes.push_back(node);
@ -284,20 +307,50 @@ void RegExpTreeDictionary::loadData()
use_vectorscan = false;
if (!use_vectorscan)
return;
#if USE_VECTORSCAN
try
#if USE_VECTORSCAN
std::vector<const char *> patterns;
std::vector<unsigned int> flags;
std::vector<size_t> lengths;
for (const std::string & simple_regexp : simple_regexps)
{
std::vector<std::string_view> regexps_views(simple_regexps.begin(), simple_regexps.end());
hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
hyperscan_regex->get();
patterns.push_back(simple_regexp.data());
lengths.push_back(simple_regexp.size());
flags.push_back(HS_FLAG_SINGLEMATCH);
}
catch (Exception & e)
hs_database_t * db = nullptr;
hs_compile_error_t * compile_error;
std::unique_ptr<unsigned int[]> ids;
ids.reset(new unsigned int[patterns.size()]);
for (size_t i = 0; i < patterns.size(); i++)
ids[i] = static_cast<unsigned>(i+1);
hs_error_t err = hs_compile_lit_multi(patterns.data(), flags.data(), ids.get(), lengths.data(), static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr, &db, &compile_error);
origin_db = (db);
if (err != HS_SUCCESS)
{
/// Some compile errors will be thrown as LOGICAL ERROR and cause crash, e.g. empty expression or expressions are too large.
/// We catch the error here and rethrow again.
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Error occurs when compiling regular expressions, reason: {}", e.message());
/// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
MultiRegexps::CompilerErrorPtr error(compile_error);
if (error->expression < 0)
throw Exception::createRuntime(ErrorCodes::LOGICAL_ERROR, String(error->message));
else
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Pattern '{}' failed with error '{}'", patterns[error->expression], String(error->message));
}
#endif
/// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
/// function which is faster than allocating scratch space each time in each thread.
hs_scratch_t * scratch = nullptr;
err = hs_alloc_scratch(db, &scratch);
origin_scratch.reset(scratch);
/// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
if (err != HS_SUCCESS)
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not allocate scratch space for vectorscan");
#endif
}
else
{
@ -396,47 +449,70 @@ bool RegExpTreeDictionary::setAttributes(
return attributes_to_set.size() == attributes.size();
}
namespace
/// a temp struct to store all the matched result.
struct MatchContext
{
struct MatchContext
std::set<UInt64> matched_idx_set;
std::vector<std::pair<UInt64, UInt64>> matched_idx_sorted_list;
const std::vector<UInt64> & regexp_ids ;
const std::unordered_map<UInt64, UInt64> & topology_order;
const char * data;
size_t length;
const std::map<UInt64, RegExpTreeDictionary::RegexTreeNodePtr> & regex_nodes;
size_t pre_match_counter = 0;
size_t match_counter = 0;
MatchContext(
const std::vector<UInt64> & regexp_ids_,
const std::unordered_map<UInt64, UInt64> & topology_order_,
const char * data_, size_t length_,
const std::map<UInt64, RegExpTreeDictionary::RegexTreeNodePtr> & regex_nodes_
)
: regexp_ids(regexp_ids_),
topology_order(topology_order_),
data(data_),
length(length_),
regex_nodes(regex_nodes_)
{}
[[maybe_unused]]
void insertIdx(unsigned int idx)
{
std::set<UInt64> matched_idx_set;
std::vector<std::pair<UInt64, UInt64>> matched_idx_sorted_list;
const std::vector<UInt64> & regexp_ids ;
const std::unordered_map<UInt64, UInt64> & topology_order;
MatchContext(const std::vector<UInt64> & regexp_ids_, const std::unordered_map<UInt64, UInt64> & topology_order_)
: regexp_ids(regexp_ids_), topology_order(topology_order_) {}
[[maybe_unused]]
void insertIdx(unsigned int idx)
UInt64 node_id = regexp_ids[idx-1];
pre_match_counter++;
if (!regex_nodes.at(node_id)->match(data, length))
{
UInt64 node_id = regexp_ids[idx-1];
UInt64 topological_order = topology_order.at(node_id);
matched_idx_set.emplace(node_id);
matched_idx_sorted_list.push_back(std::make_pair(topological_order, node_id));
return;
}
match_counter++;
matched_idx_set.emplace(node_id);
void insertNodeID(UInt64 id)
{
UInt64 topological_order = topology_order.at(id);
matched_idx_set.emplace(id);
matched_idx_sorted_list.push_back(std::make_pair(topological_order, id));
}
UInt64 topological_order = topology_order.at(node_id);
matched_idx_sorted_list.push_back(std::make_pair(topological_order, node_id));
}
/// Sort by topological order, which indicates the matching priorities.
void sort()
{
std::sort(matched_idx_sorted_list.begin(), matched_idx_sorted_list.end());
}
[[maybe_unused]]
void insertNodeID(UInt64 id)
{
matched_idx_set.emplace(id);
bool contains(UInt64 idx) const
{
return matched_idx_set.contains(idx);
}
};
}
UInt64 topological_order = topology_order.at(id);
matched_idx_sorted_list.push_back(std::make_pair(topological_order, id));
}
/// Sort by topological order, which indicates the matching priorities.
void sort()
{
std::sort(matched_idx_sorted_list.begin(), matched_idx_sorted_list.end());
}
bool contains(UInt64 idx) const
{
return matched_idx_set.contains(idx);
}
};
std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
const ColumnString::Chars & keys_data,
@ -449,7 +525,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
hs_scratch_t * scratch = nullptr;
if (use_vectorscan)
{
hs_error_t err = hs_clone_scratch(hyperscan_regex->get()->getScratch(), &scratch);
hs_error_t err = hs_clone_scratch(origin_scratch.get(), &scratch);
if (err != HS_SUCCESS)
{
@ -476,11 +552,14 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
auto key_offset = keys_offsets[key_idx];
UInt64 length = key_offset - offset - 1;
MatchContext match_result(regexp_ids, topology_order);
const char * begin = reinterpret_cast<const char *>(keys_data.data()) + offset;
MatchContext match_result(regexp_ids, topology_order, begin, length, regex_nodes);
#if USE_VECTORSCAN
if (use_vectorscan)
{
/// pre-select all the possible matches
auto on_match = [](unsigned int id,
unsigned long long /* from */, // NOLINT
unsigned long long /* to */, // NOLINT
@ -490,8 +569,9 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
static_cast<MatchContext *>(context)->insertIdx(id);
return 0;
};
hs_error_t err = hs_scan(
hyperscan_regex->get()->getDB(),
origin_db,
reinterpret_cast<const char *>(keys_data.data()) + offset,
static_cast<unsigned>(length),
0,
@ -501,6 +581,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
if (err != HS_SUCCESS)
throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Failed to scan data with vectorscan");
}
#endif

View File

@ -33,6 +33,7 @@ namespace ErrorCodes
class RegExpTreeDictionary : public IDictionary
{
friend struct MatchContext;
public:
struct Configuration
{
@ -162,6 +163,8 @@ private:
std::unordered_map<UInt64, UInt64> topology_order;
#if USE_VECTORSCAN
MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex;
MultiRegexps::ScratchPtr origin_scratch;
hs_database_t* origin_db;
#endif
Poco::Logger * logger;