mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-09-21 09:10:48 +00:00
Merge pull request #47218 from hanfei1991/hanfei/optimize-regexp-tree-1
Refine OptimizeRegularExpression Function and RegexpTreeDict
This commit is contained in:
commit
e3afa5090f
@ -1,3 +1,4 @@
|
||||
#include <limits>
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/PODArray.h>
|
||||
#include <Common/OptimizedRegularExpression.h>
|
||||
@ -14,13 +15,40 @@ namespace DB
|
||||
}
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
template <bool thread_safe>
|
||||
void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
/// A string literal extracted from a regular expression, together with flags telling
/// whether it is the prefix and/or the suffix of the whole string.
struct Literal
{
    std::string literal;

    /// Both flags default to false so that a default-constructed Literal is already in the
    /// same state as after clear(), instead of holding indeterminate values.
    bool prefix = false; /// this literal string is the prefix of the whole string.
    bool suffix = false; /// this literal string is the suffix of the whole string.

    /// Resets the object to the empty, non-prefix, non-suffix state.
    void clear()
    {
        literal.clear();
        prefix = false;
        suffix = false;
    }
};
|
||||
|
||||
using Literals = std::vector<Literal>;
|
||||
|
||||
size_t shortest_literal_length(const Literals & literals)
|
||||
{
|
||||
if (literals.empty()) return 0;
|
||||
size_t shortest = std::numeric_limits<size_t>::max();
|
||||
for (const auto & lit : literals)
|
||||
if (shortest > lit.literal.size())
|
||||
shortest = lit.literal.size();
|
||||
return shortest;
|
||||
}
|
||||
|
||||
const char * analyzeImpl(
|
||||
std::string_view regexp,
|
||||
std::string & required_substring,
|
||||
const char * pos,
|
||||
Literal & required_substring,
|
||||
bool & is_trivial,
|
||||
bool & required_substring_is_prefix)
|
||||
Literals & global_alternatives)
|
||||
{
|
||||
/** The expression is trivial if all the metacharacters in it are escaped.
|
||||
* The non-alternative string is
|
||||
@ -30,12 +58,11 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
* and also avoid substrings of the form `http://` or `www` and some other
|
||||
* (this is the hack for typical use case in web analytics applications).
|
||||
*/
|
||||
const char * begin = regexp.data();
|
||||
const char * pos = begin;
|
||||
const char * begin = pos;
|
||||
const char * end = regexp.data() + regexp.size();
|
||||
bool is_first_call = begin == regexp.data();
|
||||
int depth = 0;
|
||||
is_trivial = true;
|
||||
required_substring_is_prefix = false;
|
||||
required_substring.clear();
|
||||
bool has_alternative_on_depth_0 = false;
|
||||
bool has_case_insensitive_flag = false;
|
||||
@ -47,6 +74,80 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
Substrings trivial_substrings(1);
|
||||
Substring * last_substring = &trivial_substrings.back();
|
||||
|
||||
Literals cur_alternatives;
|
||||
|
||||
auto finish_cur_alternatives = [&]()
|
||||
{
|
||||
if (cur_alternatives.empty())
|
||||
return;
|
||||
|
||||
if (global_alternatives.empty())
|
||||
{
|
||||
global_alternatives = cur_alternatives;
|
||||
cur_alternatives.clear();
|
||||
return;
|
||||
}
|
||||
/// that means current alternatives have better quality.
|
||||
if (shortest_literal_length(global_alternatives) < shortest_literal_length(cur_alternatives))
|
||||
{
|
||||
global_alternatives.clear();
|
||||
global_alternatives = cur_alternatives;
|
||||
}
|
||||
cur_alternatives.clear();
|
||||
};
|
||||
|
||||
auto finish_non_trivial_char = [&](bool create_new_substr = true)
|
||||
{
|
||||
if (depth != 0)
|
||||
return;
|
||||
|
||||
for (auto & alter : cur_alternatives)
|
||||
{
|
||||
if (alter.suffix)
|
||||
{
|
||||
alter.literal += last_substring->first;
|
||||
}
|
||||
}
|
||||
|
||||
finish_cur_alternatives();
|
||||
|
||||
if (!last_substring->first.empty() && create_new_substr)
|
||||
{
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
};
|
||||
|
||||
/// Resolve the string or alters in a group (xxxxx)
|
||||
auto finish_group = [&](Literal & group_required_string, Literals & group_alternatives)
|
||||
{
|
||||
for (auto & alter : group_alternatives)
|
||||
{
|
||||
if (alter.prefix)
|
||||
{
|
||||
alter.literal = last_substring->first + alter.literal;
|
||||
}
|
||||
}
|
||||
|
||||
if (group_required_string.prefix)
|
||||
last_substring->first += group_required_string.literal;
|
||||
else
|
||||
{
|
||||
finish_non_trivial_char();
|
||||
last_substring->first = group_required_string.literal;
|
||||
}
|
||||
/// if we can still append, no need to finish it. e.g. abc(de)fg should capture abcdefg
|
||||
if (!last_substring->first.empty() && !group_required_string.suffix)
|
||||
{
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
|
||||
/// assign group alters to current alters.
|
||||
finish_cur_alternatives();
|
||||
cur_alternatives = std::move(group_alternatives);
|
||||
};
|
||||
|
||||
bool in_curly_braces = false;
|
||||
bool in_square_braces = false;
|
||||
|
||||
@ -73,25 +174,19 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
case '$':
|
||||
case '.':
|
||||
case '[':
|
||||
case ']':
|
||||
case '?':
|
||||
case '*':
|
||||
case '+':
|
||||
case '-':
|
||||
case '{':
|
||||
if (depth == 0 && !in_curly_braces && !in_square_braces)
|
||||
{
|
||||
if (last_substring->first.empty())
|
||||
last_substring->second = pos - begin;
|
||||
last_substring->first.push_back(*pos);
|
||||
}
|
||||
break;
|
||||
case '}':
|
||||
case '/':
|
||||
goto ordinary;
|
||||
default:
|
||||
/// all other escape sequences are not supported
|
||||
is_trivial = false;
|
||||
if (!last_substring->first.empty())
|
||||
{
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
finish_non_trivial_char();
|
||||
break;
|
||||
}
|
||||
|
||||
@ -100,28 +195,19 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
}
|
||||
|
||||
case '|':
|
||||
if (depth == 0)
|
||||
has_alternative_on_depth_0 = true;
|
||||
is_trivial = false;
|
||||
if (!in_square_braces && !last_substring->first.empty())
|
||||
{
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
++pos;
|
||||
if (depth == 0)
|
||||
{
|
||||
has_alternative_on_depth_0 = true;
|
||||
goto finish;
|
||||
}
|
||||
break;
|
||||
|
||||
case '(':
|
||||
is_trivial = false;
|
||||
if (!in_square_braces)
|
||||
{
|
||||
++depth;
|
||||
is_trivial = false;
|
||||
if (!last_substring->first.empty())
|
||||
{
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
|
||||
/// Check for case-insensitive flag.
|
||||
if (pos + 1 < end && pos[1] == '?')
|
||||
{
|
||||
@ -143,6 +229,28 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (pos + 2 < end && pos[1] == '?' && pos[2] == ':')
|
||||
{
|
||||
pos += 2;
|
||||
}
|
||||
Literal group_required_substr;
|
||||
bool group_is_trival = true;
|
||||
Literals group_alters;
|
||||
pos = analyzeImpl(regexp, pos + 1, group_required_substr, group_is_trival, group_alters);
|
||||
/// pos should be ')', if not, then it is not a valid regular expression
|
||||
if (pos == end)
|
||||
return pos;
|
||||
|
||||
/// For ()? or ()* or (){0,1}, we can just ignore the whole group.
|
||||
if ((pos + 1 < end && (pos[1] == '?' || pos[1] == '*')) ||
|
||||
(pos + 2 < end && pos[1] == '{' && pos[2] == '0'))
|
||||
{
|
||||
finish_non_trivial_char();
|
||||
}
|
||||
else
|
||||
{
|
||||
finish_group(group_required_substr, group_alters);
|
||||
}
|
||||
}
|
||||
++pos;
|
||||
break;
|
||||
@ -151,11 +259,7 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
in_square_braces = true;
|
||||
++depth;
|
||||
is_trivial = false;
|
||||
if (!last_substring->first.empty())
|
||||
{
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
finish_non_trivial_char();
|
||||
++pos;
|
||||
break;
|
||||
|
||||
@ -163,38 +267,25 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
if (!in_square_braces)
|
||||
goto ordinary;
|
||||
|
||||
in_square_braces = false;
|
||||
--depth;
|
||||
if (depth == 0)
|
||||
in_square_braces = false;
|
||||
is_trivial = false;
|
||||
if (!last_substring->first.empty())
|
||||
{
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
finish_non_trivial_char();
|
||||
++pos;
|
||||
break;
|
||||
|
||||
case ')':
|
||||
if (!in_square_braces)
|
||||
{
|
||||
--depth;
|
||||
is_trivial = false;
|
||||
if (!last_substring->first.empty())
|
||||
{
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
goto finish;
|
||||
}
|
||||
++pos;
|
||||
break;
|
||||
|
||||
case '^': case '$': case '.': case '+':
|
||||
is_trivial = false;
|
||||
if (!last_substring->first.empty() && !in_square_braces)
|
||||
{
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
finish_non_trivial_char();
|
||||
++pos;
|
||||
break;
|
||||
|
||||
@ -206,12 +297,11 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
[[fallthrough]];
|
||||
case '*':
|
||||
is_trivial = false;
|
||||
if (!last_substring->first.empty() && !in_square_braces)
|
||||
if (depth == 0 && !last_substring->first.empty() && !in_square_braces)
|
||||
{
|
||||
last_substring->first.resize(last_substring->first.size() - 1);
|
||||
trivial_substrings.resize(trivial_substrings.size() + 1);
|
||||
last_substring = &trivial_substrings.back();
|
||||
}
|
||||
finish_non_trivial_char();
|
||||
++pos;
|
||||
break;
|
||||
|
||||
@ -236,13 +326,15 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
break;
|
||||
}
|
||||
}
|
||||
finish:
|
||||
|
||||
if (last_substring && last_substring->first.empty())
|
||||
trivial_substrings.pop_back();
|
||||
finish_non_trivial_char(false);
|
||||
|
||||
if (!is_trivial)
|
||||
{
|
||||
if (!has_alternative_on_depth_0 && !has_case_insensitive_flag)
|
||||
/// we calculate required substring even though has_alternative_on_depth_0.
|
||||
/// we will clear the required substring after putting it to alternatives.
|
||||
if (!has_case_insensitive_flag)
|
||||
{
|
||||
/// We choose the non-alternative substring of the maximum length for first search.
|
||||
|
||||
@ -262,19 +354,45 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
}
|
||||
}
|
||||
|
||||
if (max_length >= MIN_LENGTH_FOR_STRSTR)
|
||||
if (max_length >= MIN_LENGTH_FOR_STRSTR || (!is_first_call && max_length > 0))
|
||||
{
|
||||
required_substring = candidate_it->first;
|
||||
required_substring_is_prefix = candidate_it->second == 0;
|
||||
required_substring.literal = candidate_it->first;
|
||||
required_substring.prefix = candidate_it->second == 0;
|
||||
required_substring.suffix = candidate_it + 1 == trivial_substrings.end();
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (!trivial_substrings.empty())
|
||||
{
|
||||
required_substring = trivial_substrings.front().first;
|
||||
required_substring_is_prefix = trivial_substrings.front().second == 0;
|
||||
required_substring.literal = trivial_substrings.front().first;
|
||||
required_substring.prefix = trivial_substrings.front().second == 0;
|
||||
required_substring.suffix = true;
|
||||
}
|
||||
|
||||
/// if it is xxx|xxx|xxx, we should call the next xxx|xxx recursively and collect the result.
|
||||
if (has_alternative_on_depth_0)
|
||||
{
|
||||
/// compare the quality of required substring and alternatives and choose the better one.
|
||||
if (shortest_literal_length(global_alternatives) < required_substring.literal.size())
|
||||
global_alternatives = {required_substring};
|
||||
Literals next_alternatives;
|
||||
/// these two values are useless: xxx|xxx can be neither trivial nor a prefix.
|
||||
bool next_is_trivial = true;
|
||||
pos = analyzeImpl(regexp, pos, required_substring, next_is_trivial, next_alternatives);
|
||||
/// For xxx|xxx|xxx, we only combine the alternatives and return an empty required_substring.
|
||||
if (next_alternatives.empty() || shortest_literal_length(next_alternatives) < required_substring.literal.size())
|
||||
{
|
||||
global_alternatives.push_back(required_substring);
|
||||
}
|
||||
else
|
||||
{
|
||||
global_alternatives.insert(global_alternatives.end(), next_alternatives.begin(), next_alternatives.end());
|
||||
}
|
||||
required_substring.clear();
|
||||
}
|
||||
|
||||
return pos;
|
||||
|
||||
/* std::cerr
|
||||
<< "regexp: " << regexp
|
||||
<< ", is_trivial: " << is_trivial
|
||||
@ -282,12 +400,31 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
<< ", required_substring_is_prefix: " << required_substring_is_prefix
|
||||
<< std::endl;*/
|
||||
}
|
||||
}
|
||||
|
||||
template <bool thread_safe>
|
||||
void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
std::string_view regexp_,
|
||||
std::string & required_substring,
|
||||
bool & is_trivial,
|
||||
bool & required_substring_is_prefix,
|
||||
std::vector<std::string> & alternatives)
|
||||
{
|
||||
Literals alternative_literals;
|
||||
Literal required_literal;
|
||||
analyzeImpl(regexp_, regexp_.data(), required_literal, is_trivial, alternative_literals);
|
||||
required_substring = std::move(required_literal.literal);
|
||||
required_substring_is_prefix = required_literal.prefix;
|
||||
for (auto & lit : alternative_literals)
|
||||
alternatives.push_back(std::move(lit.literal));
|
||||
}
|
||||
|
||||
template <bool thread_safe>
|
||||
OptimizedRegularExpressionImpl<thread_safe>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
|
||||
{
|
||||
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
|
||||
std::vector<std::string> alternativesDummy; /// this vector extracts patterns a,b,c from pattern (a|b|c). for now it's not used.
|
||||
analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix, alternativesDummy);
|
||||
|
||||
|
||||
/// Just three following options are supported
|
||||
if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
|
||||
|
@ -95,6 +95,15 @@ public:
|
||||
out_required_substring_is_prefix = required_substring_is_prefix;
|
||||
}
|
||||
|
||||
/// Extracts from a regexp either the longest required string literal or several alternative string literals,
|
||||
/// so that a string can be pre-checked: if it contains none of the literal(s), it can never match the regexp.
/// - required_substring: receives the required literal; it is left empty when none was selected.
/// - is_trivial: set to true if all the metacharacters in the regexp are escaped.
/// - required_substring_is_prefix: set from the prefix flag of the required literal.
/// - alternatives: the extracted alternative literals are appended to this vector.
|
||||
static void analyze(
|
||||
std::string_view regexp_,
|
||||
std::string & required_substring,
|
||||
bool & is_trivial,
|
||||
bool & required_substring_is_prefix,
|
||||
std::vector<std::string> & alternatives);
|
||||
|
||||
private:
|
||||
bool is_trivial;
|
||||
bool required_substring_is_prefix;
|
||||
@ -104,8 +113,6 @@ private:
|
||||
std::optional<DB::ASCIICaseInsensitiveStringSearcher> case_insensitive_substring_searcher;
|
||||
std::unique_ptr<RegexType> re2;
|
||||
unsigned number_of_subpatterns;
|
||||
|
||||
static void analyze(std::string_view regexp_, std::string & required_substring, bool & is_trivial, bool & required_substring_is_prefix);
|
||||
};
|
||||
|
||||
using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>;
|
||||
|
46
src/Common/tests/gtest_optimize_re.cpp
Normal file
46
src/Common/tests/gtest_optimize_re.cpp
Normal file
@ -0,0 +1,46 @@
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include <Common/OptimizedRegularExpression.h>
|
||||
|
||||
/// Checks OptimizedRegularExpression::analyze against a table of patterns: for each pattern we compare
/// the required substring, the list of extracted alternatives and the "is trivial" flag.
TEST(OptimizeRE, analyze)
{
    /// Runs analyze on `pattern` and compares all three outputs with the expectations.
    /// By default no alternatives are expected and the pattern is expected to be non-trivial.
    auto check = [](const std::string & pattern,
                    const std::string & expected_required,
                    const std::vector<std::string> & expected_alternatives = {},
                    bool expected_trivial = false)
    {
        std::string required_substring;
        std::vector<std::string> found_alternatives;
        bool trivial = false;
        bool prefix = false;

        OptimizedRegularExpression::analyze(pattern, required_substring, trivial, prefix, found_alternatives);

        /// Print the pattern so that a failing expectation can be attributed to it.
        std::cerr << pattern << std::endl;

        EXPECT_EQ(required_substring, expected_required);
        EXPECT_EQ(found_alternatives, expected_alternatives);
        EXPECT_EQ(trivial, expected_trivial);
    };

    check("abc", "abc", {}, true);
    check("c([^k]*)de", "");
    check("abc(de)fg", "abcdefg");
    check("abc(de|xyz)fg", "abc", {"abcdefg", "abcxyzfg"});
    check("abc(de?f|xyz)fg", "abc", {"abcd", "abcxyzfg"});
    check("abc|fgk|xyz", "", {"abc", "fgk", "xyz"});
    check("(abc)", "abc");
    check("(abc|fgk)", "", {"abc", "fgk"});
    check("(abc|fgk)(e|f|zkh|)", "", {"abc", "fgk"});
    check("abc(abc|fg)xyzz", "xyzz", {"abcabcxyzz", "abcfgxyzz"});
    check("abc[k]xyzz", "xyzz");
    check("(abc[k]xyzz)", "xyzz");
    check("abc((de)fg(hi))jk", "abcdefghijk");
    check("abc((?:de)fg(?:hi))jk", "abcdefghijk");
    check("abc((de)fghi+zzz)jk", "abcdefghi");
    check("abc((de)fg(hi))?jk", "abc");
    check("abc((de)fghi?zzz)jk", "abcdefgh");
    check("abc(*cd)jk", "cdjk");
    check(R"(abc(de|xyz|(\{xx\}))fg)", "abc", {"abcdefg", "abcxyzfg", "abc{xx}fg"});
    check("abc(abc|fg)?xyzz", "xyzz");
    check("abc(abc|fg){0,1}xyzz", "xyzz");
    check("abc(abc|fg)xyzz|bcdd?k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bc"});
    check("abc(abc|fg)xyzz|bc(dd?x|kk?y|(f))k|bc(f|g|h?)z", "", {"abcabcxyzz", "abcfgxyzz", "bcd", "bck", "bcfk", "bc"});
    check("((?:abc|efg|xyz)/[a-zA-Z0-9]{1-50})(/?[^ ]*|)", "", {"abc/", "efg/", "xyz/"});
    check(R"([Bb]ai[Dd]u[Ss]pider(?:-[A-Za-z]{1,30})(?:-[A-Za-z]{1,30}|)|bingbot|\bYeti(?:-[a-z]{1,30}|)|Catchpoint(?: bot|)|[Cc]harlotte|Daumoa(?:-feedfetcher|)|(?:[a-zA-Z]{1,30}-|)Googlebot(?:-[a-zA-Z]{1,30}|))", "", {"pider-", "bingbot", "Yeti-", "Yeti", "Catchpoint bot", "Catchpoint", "harlotte", "Daumoa-feedfetcher", "Daumoa", "-Googlebot", "Googlebot"});
    check("abc|(:?xx|yy|zz|x?)def", "", {"abc", "def"});
    check("abc|(:?xx|yy|zz|x?){1,2}def", "", {"abc", "def"});
}
|
@ -934,7 +934,7 @@ class IColumn;
|
||||
M(Bool, input_format_bson_skip_fields_with_unsupported_types_in_schema_inference, false, "Skip fields with unsupported types while schema inference for format BSON.", 0) \
|
||||
\
|
||||
M(Bool, regexp_dict_allow_other_sources, false, "Allow regexp_tree dictionary to use sources other than yaml source.", 0) \
|
||||
M(Bool, regexp_dict_allow_hyperscan, false, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
|
||||
M(Bool, regexp_dict_allow_hyperscan, true, "Allow regexp_tree dictionary using Hyperscan library.", 0) \
|
||||
|
||||
// End of FORMAT_FACTORY_SETTINGS
|
||||
// Please add settings non-related to formats into the COMMON_SETTINGS above.
|
||||
|
@ -9,9 +9,10 @@
|
||||
#include <Poco/Logger.h>
|
||||
#include <Poco/RegularExpression.h>
|
||||
|
||||
#include "Common/Exception.h"
|
||||
#include <Common/ArenaUtils.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/logger_useful.h>
|
||||
#include <Common/OptimizedRegularExpression.h>
|
||||
#include <Core/ColumnsWithTypeAndName.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
@ -34,6 +35,7 @@
|
||||
|
||||
#if USE_VECTORSCAN
|
||||
# include <hs.h>
|
||||
# include <hs_compile.h>
|
||||
#endif
|
||||
|
||||
namespace DB
|
||||
@ -46,6 +48,7 @@ namespace ErrorCodes
|
||||
extern const int HYPERSCAN_CANNOT_SCAN_TEXT;
|
||||
extern const int UNSUPPORTED_METHOD;
|
||||
extern const int INCORRECT_DICTIONARY_DEFINITION;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
const std::string kRegExp = "regexp";
|
||||
@ -172,10 +175,6 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
|
||||
auto keys_column = block.getByName(kKeys).column;
|
||||
auto values_column = block.getByName(kValues).column;
|
||||
|
||||
#ifdef USE_VECTORSCAN
|
||||
SlowWithHyperscanChecker checker;
|
||||
#endif
|
||||
|
||||
size_t size = block.rows();
|
||||
for (size_t i = 0; i < size; i++)
|
||||
{
|
||||
@ -219,12 +218,36 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
|
||||
}
|
||||
}
|
||||
regex_nodes.emplace(id, node);
|
||||
|
||||
#if USE_VECTORSCAN
|
||||
if (use_vectorscan && !checker.isSlow(regex))
|
||||
String required_substring;
|
||||
bool is_trivial, required_substring_is_prefix;
|
||||
std::vector<std::string> alternatives;
|
||||
|
||||
if (use_vectorscan)
|
||||
OptimizedRegularExpression::analyze(regex, required_substring, is_trivial, required_substring_is_prefix, alternatives);
|
||||
|
||||
for (auto & alter : alternatives)
|
||||
{
|
||||
simple_regexps.push_back(regex);
|
||||
if (alter.size() < 3)
|
||||
{
|
||||
alternatives.clear();
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!required_substring.empty())
|
||||
{
|
||||
simple_regexps.push_back(required_substring);
|
||||
regexp_ids.push_back(id);
|
||||
}
|
||||
else if (!alternatives.empty())
|
||||
{
|
||||
for (auto & alternative : alternatives)
|
||||
{
|
||||
simple_regexps.push_back(alternative);
|
||||
regexp_ids.push_back(id);
|
||||
}
|
||||
}
|
||||
else
|
||||
#endif
|
||||
complex_regexp_nodes.push_back(node);
|
||||
@ -284,20 +307,50 @@ void RegExpTreeDictionary::loadData()
|
||||
use_vectorscan = false;
|
||||
if (!use_vectorscan)
|
||||
return;
|
||||
#if USE_VECTORSCAN
|
||||
try
|
||||
|
||||
#if USE_VECTORSCAN
|
||||
std::vector<const char *> patterns;
|
||||
std::vector<unsigned int> flags;
|
||||
std::vector<size_t> lengths;
|
||||
|
||||
for (const std::string & simple_regexp : simple_regexps)
|
||||
{
|
||||
std::vector<std::string_view> regexps_views(simple_regexps.begin(), simple_regexps.end());
|
||||
hyperscan_regex = MultiRegexps::getOrSet<true, false>(regexps_views, std::nullopt);
|
||||
hyperscan_regex->get();
|
||||
patterns.push_back(simple_regexp.data());
|
||||
lengths.push_back(simple_regexp.size());
|
||||
flags.push_back(HS_FLAG_SINGLEMATCH);
|
||||
}
|
||||
catch (Exception & e)
|
||||
|
||||
hs_database_t * db = nullptr;
|
||||
hs_compile_error_t * compile_error;
|
||||
|
||||
std::unique_ptr<unsigned int[]> ids;
|
||||
ids.reset(new unsigned int[patterns.size()]);
|
||||
for (size_t i = 0; i < patterns.size(); i++)
|
||||
ids[i] = static_cast<unsigned>(i+1);
|
||||
|
||||
hs_error_t err = hs_compile_lit_multi(patterns.data(), flags.data(), ids.get(), lengths.data(), static_cast<unsigned>(patterns.size()), HS_MODE_BLOCK, nullptr, &db, &compile_error);
|
||||
origin_db = (db);
|
||||
if (err != HS_SUCCESS)
|
||||
{
|
||||
/// Some compile errors will be thrown as LOGICAL ERROR and cause crash, e.g. empty expression or expressions are too large.
|
||||
/// We catch the error here and rethrow again.
|
||||
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "Error occurs when compiling regular expressions, reason: {}", e.message());
|
||||
/// CompilerError is a unique_ptr, so correct memory free after the exception is thrown.
|
||||
MultiRegexps::CompilerErrorPtr error(compile_error);
|
||||
|
||||
if (error->expression < 0)
|
||||
throw Exception::createRuntime(ErrorCodes::LOGICAL_ERROR, String(error->message));
|
||||
else
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS, "Pattern '{}' failed with error '{}'", patterns[error->expression], String(error->message));
|
||||
}
|
||||
#endif
|
||||
|
||||
/// We allocate the scratch space only once, then copy it across multiple threads with hs_clone_scratch
|
||||
/// function which is faster than allocating scratch space each time in each thread.
|
||||
hs_scratch_t * scratch = nullptr;
|
||||
err = hs_alloc_scratch(db, &scratch);
|
||||
origin_scratch.reset(scratch);
|
||||
/// If not HS_SUCCESS, it is guaranteed that the memory would not be allocated for scratch.
|
||||
if (err != HS_SUCCESS)
|
||||
throw Exception(ErrorCodes::CANNOT_ALLOCATE_MEMORY, "Could not allocate scratch space for vectorscan");
|
||||
#endif
|
||||
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -396,47 +449,70 @@ bool RegExpTreeDictionary::setAttributes(
|
||||
return attributes_to_set.size() == attributes.size();
|
||||
}
|
||||
|
||||
namespace
|
||||
/// a temp struct to store all the matched result.
|
||||
struct MatchContext
|
||||
{
|
||||
struct MatchContext
|
||||
std::set<UInt64> matched_idx_set;
|
||||
std::vector<std::pair<UInt64, UInt64>> matched_idx_sorted_list;
|
||||
|
||||
const std::vector<UInt64> & regexp_ids ;
|
||||
const std::unordered_map<UInt64, UInt64> & topology_order;
|
||||
const char * data;
|
||||
size_t length;
|
||||
const std::map<UInt64, RegExpTreeDictionary::RegexTreeNodePtr> & regex_nodes;
|
||||
|
||||
size_t pre_match_counter = 0;
|
||||
size_t match_counter = 0;
|
||||
|
||||
MatchContext(
|
||||
const std::vector<UInt64> & regexp_ids_,
|
||||
const std::unordered_map<UInt64, UInt64> & topology_order_,
|
||||
const char * data_, size_t length_,
|
||||
const std::map<UInt64, RegExpTreeDictionary::RegexTreeNodePtr> & regex_nodes_
|
||||
)
|
||||
: regexp_ids(regexp_ids_),
|
||||
topology_order(topology_order_),
|
||||
data(data_),
|
||||
length(length_),
|
||||
regex_nodes(regex_nodes_)
|
||||
{}
|
||||
|
||||
[[maybe_unused]]
|
||||
void insertIdx(unsigned int idx)
|
||||
{
|
||||
std::set<UInt64> matched_idx_set;
|
||||
std::vector<std::pair<UInt64, UInt64>> matched_idx_sorted_list;
|
||||
|
||||
const std::vector<UInt64> & regexp_ids ;
|
||||
const std::unordered_map<UInt64, UInt64> & topology_order;
|
||||
|
||||
MatchContext(const std::vector<UInt64> & regexp_ids_, const std::unordered_map<UInt64, UInt64> & topology_order_)
|
||||
: regexp_ids(regexp_ids_), topology_order(topology_order_) {}
|
||||
|
||||
[[maybe_unused]]
|
||||
void insertIdx(unsigned int idx)
|
||||
UInt64 node_id = regexp_ids[idx-1];
|
||||
pre_match_counter++;
|
||||
if (!regex_nodes.at(node_id)->match(data, length))
|
||||
{
|
||||
UInt64 node_id = regexp_ids[idx-1];
|
||||
UInt64 topological_order = topology_order.at(node_id);
|
||||
matched_idx_set.emplace(node_id);
|
||||
matched_idx_sorted_list.push_back(std::make_pair(topological_order, node_id));
|
||||
return;
|
||||
}
|
||||
match_counter++;
|
||||
matched_idx_set.emplace(node_id);
|
||||
|
||||
void insertNodeID(UInt64 id)
|
||||
{
|
||||
UInt64 topological_order = topology_order.at(id);
|
||||
matched_idx_set.emplace(id);
|
||||
matched_idx_sorted_list.push_back(std::make_pair(topological_order, id));
|
||||
}
|
||||
UInt64 topological_order = topology_order.at(node_id);
|
||||
matched_idx_sorted_list.push_back(std::make_pair(topological_order, node_id));
|
||||
}
|
||||
|
||||
/// Sort by topological order, which indicates the matching priorities.
|
||||
void sort()
|
||||
{
|
||||
std::sort(matched_idx_sorted_list.begin(), matched_idx_sorted_list.end());
|
||||
}
|
||||
[[maybe_unused]]
|
||||
void insertNodeID(UInt64 id)
|
||||
{
|
||||
matched_idx_set.emplace(id);
|
||||
|
||||
bool contains(UInt64 idx) const
|
||||
{
|
||||
return matched_idx_set.contains(idx);
|
||||
}
|
||||
};
|
||||
}
|
||||
UInt64 topological_order = topology_order.at(id);
|
||||
matched_idx_sorted_list.push_back(std::make_pair(topological_order, id));
|
||||
}
|
||||
|
||||
/// Sort by topological order, which indicates the matching priorities.
|
||||
void sort()
|
||||
{
|
||||
std::sort(matched_idx_sorted_list.begin(), matched_idx_sorted_list.end());
|
||||
}
|
||||
|
||||
bool contains(UInt64 idx) const
|
||||
{
|
||||
return matched_idx_set.contains(idx);
|
||||
}
|
||||
};
|
||||
|
||||
std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
|
||||
const ColumnString::Chars & keys_data,
|
||||
@ -449,7 +525,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
|
||||
hs_scratch_t * scratch = nullptr;
|
||||
if (use_vectorscan)
|
||||
{
|
||||
hs_error_t err = hs_clone_scratch(hyperscan_regex->get()->getScratch(), &scratch);
|
||||
hs_error_t err = hs_clone_scratch(origin_scratch.get(), &scratch);
|
||||
|
||||
if (err != HS_SUCCESS)
|
||||
{
|
||||
@ -476,11 +552,14 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
|
||||
auto key_offset = keys_offsets[key_idx];
|
||||
UInt64 length = key_offset - offset - 1;
|
||||
|
||||
MatchContext match_result(regexp_ids, topology_order);
|
||||
const char * begin = reinterpret_cast<const char *>(keys_data.data()) + offset;
|
||||
|
||||
MatchContext match_result(regexp_ids, topology_order, begin, length, regex_nodes);
|
||||
|
||||
#if USE_VECTORSCAN
|
||||
if (use_vectorscan)
|
||||
{
|
||||
/// pre-select all the possible matches
|
||||
auto on_match = [](unsigned int id,
|
||||
unsigned long long /* from */, // NOLINT
|
||||
unsigned long long /* to */, // NOLINT
|
||||
@ -490,8 +569,9 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
|
||||
static_cast<MatchContext *>(context)->insertIdx(id);
|
||||
return 0;
|
||||
};
|
||||
|
||||
hs_error_t err = hs_scan(
|
||||
hyperscan_regex->get()->getDB(),
|
||||
origin_db,
|
||||
reinterpret_cast<const char *>(keys_data.data()) + offset,
|
||||
static_cast<unsigned>(length),
|
||||
0,
|
||||
@ -501,6 +581,7 @@ std::unordered_map<String, ColumnPtr> RegExpTreeDictionary::match(
|
||||
|
||||
if (err != HS_SUCCESS)
|
||||
throw Exception(ErrorCodes::HYPERSCAN_CANNOT_SCAN_TEXT, "Failed to scan data with vectorscan");
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -33,6 +33,7 @@ namespace ErrorCodes
|
||||
|
||||
class RegExpTreeDictionary : public IDictionary
|
||||
{
|
||||
friend struct MatchContext;
|
||||
public:
|
||||
struct Configuration
|
||||
{
|
||||
@ -162,6 +163,8 @@ private:
|
||||
std::unordered_map<UInt64, UInt64> topology_order;
|
||||
#if USE_VECTORSCAN
|
||||
MultiRegexps::DeferredConstructedRegexpsPtr hyperscan_regex;
|
||||
MultiRegexps::ScratchPtr origin_scratch;
|
||||
hs_database_t* origin_db;
|
||||
#endif
|
||||
|
||||
Poco::Logger * logger;
|
||||
|
Loading…
Reference in New Issue
Block a user