Merge pull request #5191 from yandex/regexp_extraction_fix

Regexp extraction fix for small prefixes
This commit is contained in:
alexey-milovidov 2019-05-05 15:16:57 +03:00 committed by GitHub
commit 324dcf0084
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -1,7 +1,6 @@
#include <Common/Exception.h>
#include <Common/OptimizedRegularExpression.h>
#define MIN_LENGTH_FOR_STRSTR 3
#define MAX_SUBPATTERNS 5
@ -214,23 +213,38 @@ void OptimizedRegularExpressionImpl<thread_safe>::analyze(
/** We choose the non-alternative substring of the maximum length, among the prefixes,
* or a non-alternative substring of maximum length.
*/
/// Tuning for typical usage domain
auto tuning_strings_condition = [](const std::string & str)
{
return str != "://" && str != "http://" && str != "www" && str != "Windows ";
};
size_t max_length = 0;
Substrings::const_iterator candidate_it = trivial_substrings.begin();
for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
{
if (((it->second == 0 && candidate_it->second != 0)
|| ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
/// Tuning for typical usage domain
&& (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://")))
&& (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http")))
&& (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www")))
&& (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows "))))
&& tuning_strings_condition(it->first))
{
max_length = it->first.size();
candidate_it = it;
}
}
/// If prefix is small, it won't be chosen
if (max_length < MIN_LENGTH_FOR_STRSTR)
{
for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
{
if (it->first.size() > max_length && tuning_strings_condition(it->first))
{
max_length = it->first.size();
candidate_it = it;
}
}
}
if (max_length >= MIN_LENGTH_FOR_STRSTR)
{
required_substring = candidate_it->first;