mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-22 23:52:03 +00:00
Merge pull request #5191 from yandex/regexp_extraction_fix
Regexp extraction fix for small prefixes
This commit is contained in:
commit
324dcf0084
@ -1,7 +1,6 @@
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/OptimizedRegularExpression.h>
|
||||
|
||||
|
||||
#define MIN_LENGTH_FOR_STRSTR 3
|
||||
#define MAX_SUBPATTERNS 5
|
||||
|
||||
@ -214,23 +213,38 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
|
||||
/** We choose the non-alternative substring of the maximum length, among the prefixes,
|
||||
* or, if no suitable prefix exists, a non-alternative substring of maximum length overall.
|
||||
*/
|
||||
|
||||
/// Tuning for typical usage domain
|
||||
auto tuning_strings_condition = [](const std::string & str)
|
||||
{
|
||||
return str != "://" && str != "http://" && str != "www" && str != "Windows ";
|
||||
};
|
||||
size_t max_length = 0;
|
||||
Substrings::const_iterator candidate_it = trivial_substrings.begin();
|
||||
for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
|
||||
{
|
||||
if (((it->second == 0 && candidate_it->second != 0)
|
||||
|| ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
|
||||
/// Tuning for typical usage domain
|
||||
&& (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://")))
|
||||
&& (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http")))
|
||||
&& (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www")))
|
||||
&& (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows "))))
|
||||
&& tuning_strings_condition(it->first))
|
||||
{
|
||||
max_length = it->first.size();
|
||||
candidate_it = it;
|
||||
}
|
||||
}
|
||||
|
||||
/// If prefix is small, it won't be chosen
|
||||
if (max_length < MIN_LENGTH_FOR_STRSTR)
|
||||
{
|
||||
for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
|
||||
{
|
||||
if (it->first.size() > max_length && tuning_strings_condition(it->first))
|
||||
{
|
||||
max_length = it->first.size();
|
||||
candidate_it = it;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (max_length >= MIN_LENGTH_FOR_STRSTR)
|
||||
{
|
||||
required_substring = candidate_it->first;
|
||||
|
Loading…
Reference in New Issue
Block a user