address comments and add one more test

This commit is contained in:
Han Fei 2023-02-06 17:26:20 +01:00
parent baa345fa64
commit eb76041312
3 changed files with 31 additions and 9 deletions

View File

@ -157,10 +157,10 @@ namespace
/// Hyperscan is not good at processing regexes that contain bounded repeats such as {0,200}.
/// Such patterns make regex compilation slow or make it fail, so we detect these heavy regular expressions and
/// process them with re2 instead.
struct ComplexRegexChecker
struct RegexChecker
{
re2_st::RE2 searcher;
ComplexRegexChecker() : searcher(R"(\{([\d]+),([\d]+)\})") {}
RegexChecker() : searcher(R"(\{([\d]+),([\d]+)\})") {}
static bool isFigureLargerThanFifty(const String & str)
try
@ -174,15 +174,15 @@ namespace
}
[[maybe_unused]]
bool check(const String & data) const
bool isSimpleRegex(const String & regex) const
{
re2_st::StringPiece haystack(data.data(), data.size());
re2_st::StringPiece haystack(regex.data(), regex.size());
re2_st::StringPiece matches[10];
size_t start_pos = 0;
while (start_pos < data.size())
while (start_pos < regex.size())
{
if (searcher.Match(haystack, start_pos, data.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10))
if (searcher.Match(haystack, start_pos, regex.size(), re2_st::RE2::Anchor::UNANCHORED, matches, 10))
{
const auto & match = matches[0];
start_pos += match.length();
@ -207,7 +207,7 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
auto keys_column = block.getByName(kKeys).column;
auto values_column = block.getByName(kValues).column;
ComplexRegexChecker checker;
RegexChecker checker;
size_t size = block.rows();
for (size_t i = 0; i < size; i++)
@ -253,7 +253,7 @@ void RegExpTreeDictionary::initRegexNodes(Block & block)
}
regex_nodes.emplace(id, node);
#if USE_VECTORSCAN
if (use_vectorscan && !checker.check(regex))
if (use_vectorscan && !checker.isSimpleRegex(regex))
{
simple_regexps.push_back(regex);
regexp_ids.push_back(id);
@ -312,6 +312,9 @@ void RegExpTreeDictionary::loadData()
if (simple_regexps.empty() && complex_regexp_nodes.empty())
throw Exception(ErrorCodes::INCORRECT_DICTIONARY_DEFINITION, "There are no available regular expression. Please check your config");
LOG_INFO(logger, "There are {} simple regexps and {} complex regexps", simple_regexps.size(), complex_regexp_nodes.size());
/// If none of the regexps can work with hyperscan, we should turn this flag off to avoid exceptions.
if (simple_regexps.empty())
use_vectorscan = false;
if (!use_vectorscan)
return;
#if USE_VECTORSCAN

View File

@ -2,4 +2,6 @@
('Android','12')
('Android','default')
('Android','default')
('BlackBerry WebKit','10.0')
('BlackBerry WebKit','1.0')
(true,'61f0c404-5cb3-11e7-907b-a6006ad3dba0','2023-01-01','2023-01-01 01:01:01',[1,2,3,-1,-2,-3])

View File

@ -42,7 +42,8 @@ create dictionary regexp_dict1
PRIMARY KEY(regexp)
SOURCE(YAMLRegExpTree(PATH '$yaml'))
LIFETIME(0)
LAYOUT(regexp_tree);
LAYOUT(regexp_tree)
SETTINGS(regexp_dict_allow_hyperscan = true);
select dictGet('regexp_dict1', ('name', 'version'), 'Linux/123.45.67 tlinux');
select dictGet('regexp_dict1', ('name', 'version'), '31/tclwebkit1024');
@ -84,6 +85,22 @@ EOL
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1; -- { serverError 318 }
"
cat > "$yaml" <<EOL
- name: BlackBerry WebKit
regexp: (PlayBook).{1,200}RIM Tablet OS (\d+)\.(\d+)\.(\d+)
version: '\2.\3'
- name: BlackBerry WebKit
regexp: (Black[bB]erry|BB10).{1,200}Version/(\d+)\.(\d+)\.(\d+)
version: '\2.\3'
EOL
$CLICKHOUSE_CLIENT -n --query="
system reload dictionary regexp_dict1;
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (BB10; Touch) AppleWebKit/537.3+ (KHTML, like Gecko) Version/10.0.9.388 Mobile Safari/537.3+');
select dictGet('regexp_dict1', ('name', 'version'), 'Mozilla/5.0 (PlayBook; U; RIM Tablet OS 1.0.0; en-US) AppleWebKit/534.8+ (KHTML, like Gecko) Version/0.0.1 Safari/534.8+');
"
cat > "$yaml" <<EOL
- regexp: 'abc'
col_bool: 'true'