Addition to prev. revision [#CLICKHOUSE-2].

2024-11-10 01:25:21 +00:00 · 2017-05-09 22:45:21 -04:00 · 2017-05-09 22:45:21 -04:00 · ff02af9804
commit ff02af9804
parent 7fc3d90035
3 changed files with 351 additions and 349 deletions
--- a/cmake/dbms_glob_sources.cmake
+++ b/cmake/dbms_glob_sources.cmake
@ -4,7 +4,7 @@ macro(add_glob cur_list)
 endmacro()

 macro(add_headers_and_sources prefix common_path)
-    add_glob(${prefix}_headers RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h ${common_path}/*.inl)
+    add_glob(${prefix}_headers RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${common_path}/*.h)
    add_glob(${prefix}_sources ${common_path}/*.cpp ${common_path}/*.h)
 endmacro()

--- a/dbms/src/Common/OptimizedRegularExpression.h
+++ b/dbms/src/Common/OptimizedRegularExpression.h
@ -26,6 +26,8 @@
  * - if in most calls, the regular expression does not match;
  * - if the regular expression is compatible with the re2 engine;
  * - you can use at your own risk, since, probably, not all cases are taken into account.
+  *
+  * NOTE: Multi-character metasymbols such as \Pl are handled incorrectly.
  */

 namespace OptimizedRegularExpressionDetails
@ -105,4 +107,4 @@ private:

 using OptimizedRegularExpression = OptimizedRegularExpressionImpl<true>;

-#include "OptimizedRegularExpression.inl"
+#include "OptimizedRegularExpression.inl.h"
--- a/dbms/src/Common/OptimizedRegularExpression.inl.h
+++ b/dbms/src/Common/OptimizedRegularExpression.inl.h
@ -10,420 +10,420 @@

 template <bool b>
 void OptimizedRegularExpressionImpl<b>::analyze(
-	const std::string & regexp,
-	std::string & required_substring,
-	bool & is_trivial,
-	bool & required_substring_is_prefix)
+    const std::string & regexp,
+    std::string & required_substring,
+    bool & is_trivial,
+    bool & required_substring_is_prefix)
 {
-	/** The expression is trivial if all the metacharacters in it are escaped.
- 	  * The non-alternative string is
- 	  *  a string outside parentheses,
- 	  *  in which all metacharacters are escaped,
- 	  *  and also if there are no '|' outside the brackets,
- 	  *  and also avoid substrings of the form `http://` or `www`.
-	  */
-	const char * begin = regexp.data();
-	const char * pos = begin;
-	const char * end = regexp.data() + regexp.size();
-	int depth = 0;
-	is_trivial = true;
-	required_substring_is_prefix = false;
-	required_substring.clear();
-	bool has_alternative_on_depth_0 = false;
+    /** The expression is trivial if all the metacharacters in it are escaped.
+      * The non-alternative string is
+      *  a string outside parentheses,
+      *  in which all metacharacters are escaped,
+      *  and also if there are no '|' outside the brackets,
+      *  and also avoid substrings of the form `http://` or `www`.
+      */
+    const char * begin = regexp.data();
+    const char * pos = begin;
+    const char * end = regexp.data() + regexp.size();
+    int depth = 0;
+    is_trivial = true;
+    required_substring_is_prefix = false;
+    required_substring.clear();
+    bool has_alternative_on_depth_0 = false;

- 	/// Substring with a position.
-	typedef std::pair<std::string, size_t> Substring;
+    /// Substring with a position.
+    typedef std::pair<std::string, size_t> Substring;

-	typedef std::vector<Substring> Substrings;
-	Substrings trivial_substrings(1);
-	Substring * last_substring = &trivial_substrings.back();
+    typedef std::vector<Substring> Substrings;
+    Substrings trivial_substrings(1);
+    Substring * last_substring = &trivial_substrings.back();

-	bool in_curly_braces = false;
-	bool in_square_braces = false;
+    bool in_curly_braces = false;
+    bool in_square_braces = false;

-	while (pos != end)
-	{
-		switch (*pos)
-		{
-			case '\0':
-				pos = end;
-				break;
+    while (pos != end)
+    {
+        switch (*pos)
+        {
+            case '\0':
+                pos = end;
+                break;

-			case '\\':
-			{
-				++pos;
-				if (pos == end)
-					break;
+            case '\\':
+            {
+                ++pos;
+                if (pos == end)
+                    break;

-				switch (*pos)
-				{
-					case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{':
-						if (depth == 0 && !in_curly_braces && !in_square_braces)
-						{
-							if (last_substring->first.empty())
-								last_substring->second = pos - begin;
-							last_substring->first.push_back(*pos);
-						}
-						break;
-					default:
- 						/// all other escape sequences are not supported
-						is_trivial = false;
-						if (!last_substring->first.empty())
-						{
-							trivial_substrings.resize(trivial_substrings.size() + 1);
-							last_substring = &trivial_substrings.back();
-						}
-						break;
-				}
+                switch (*pos)
+                {
+                    case '|': case '(': case ')': case '^': case '$': case '.': case '[': case '?': case '*': case '+': case '{':
+                        if (depth == 0 && !in_curly_braces && !in_square_braces)
+                        {
+                            if (last_substring->first.empty())
+                                last_substring->second = pos - begin;
+                            last_substring->first.push_back(*pos);
+                        }
+                        break;
+                    default:
+                        /// all other escape sequences are not supported
+                        is_trivial = false;
+                        if (!last_substring->first.empty())
+                        {
+                            trivial_substrings.resize(trivial_substrings.size() + 1);
+                            last_substring = &trivial_substrings.back();
+                        }
+                        break;
+                }

-				++pos;
-				break;
-			}
+                ++pos;
+                break;
+            }

-			case '|':
-				if (depth == 0)
-					has_alternative_on_depth_0 = true;
-				is_trivial = false;
-				if (!in_square_braces && !last_substring->first.empty())
-				{
-					trivial_substrings.resize(trivial_substrings.size() + 1);
-					last_substring = &trivial_substrings.back();
-				}
-				++pos;
-				break;
+            case '|':
+                if (depth == 0)
+                    has_alternative_on_depth_0 = true;
+                is_trivial = false;
+                if (!in_square_braces && !last_substring->first.empty())
+                {
+                    trivial_substrings.resize(trivial_substrings.size() + 1);
+                    last_substring = &trivial_substrings.back();
+                }
+                ++pos;
+                break;

-			case '(':
-				if (!in_square_braces)
-				{
-					++depth;
-					is_trivial = false;
-					if (!last_substring->first.empty())
-					{
-						trivial_substrings.resize(trivial_substrings.size() + 1);
-						last_substring = &trivial_substrings.back();
-					}
-				}
-				++pos;
-				break;
+            case '(':
+                if (!in_square_braces)
+                {
+                    ++depth;
+                    is_trivial = false;
+                    if (!last_substring->first.empty())
+                    {
+                        trivial_substrings.resize(trivial_substrings.size() + 1);
+                        last_substring = &trivial_substrings.back();
+                    }
+                }
+                ++pos;
+                break;

-			case '[':
-				in_square_braces = true;
-				++depth;
-				is_trivial = false;
-				if (!last_substring->first.empty())
-				{
-					trivial_substrings.resize(trivial_substrings.size() + 1);
-					last_substring = &trivial_substrings.back();
-				}
-				++pos;
-				break;
+            case '[':
+                in_square_braces = true;
+                ++depth;
+                is_trivial = false;
+                if (!last_substring->first.empty())
+                {
+                    trivial_substrings.resize(trivial_substrings.size() + 1);
+                    last_substring = &trivial_substrings.back();
+                }
+                ++pos;
+                break;

-			case ']':
-				if (!in_square_braces)
-					goto ordinary;
+            case ']':
+                if (!in_square_braces)
+                    goto ordinary;

-				in_square_braces = false;
-				--depth;
-				is_trivial = false;
-				if (!last_substring->first.empty())
-				{
-					trivial_substrings.resize(trivial_substrings.size() + 1);
-					last_substring = &trivial_substrings.back();
-				}
-				++pos;
-				break;
+                in_square_braces = false;
+                --depth;
+                is_trivial = false;
+                if (!last_substring->first.empty())
+                {
+                    trivial_substrings.resize(trivial_substrings.size() + 1);
+                    last_substring = &trivial_substrings.back();
+                }
+                ++pos;
+                break;

-			case ')':
-				if (!in_square_braces)
-				{
-					--depth;
-					is_trivial = false;
-					if (!last_substring->first.empty())
-					{
-						trivial_substrings.resize(trivial_substrings.size() + 1);
-						last_substring = &trivial_substrings.back();
-					}
-				}
-				++pos;
-				break;
+            case ')':
+                if (!in_square_braces)
+                {
+                    --depth;
+                    is_trivial = false;
+                    if (!last_substring->first.empty())
+                    {
+                        trivial_substrings.resize(trivial_substrings.size() + 1);
+                        last_substring = &trivial_substrings.back();
+                    }
+                }
+                ++pos;
+                break;

-			case '^': case '$': case '.': case '+':
-				is_trivial = false;
-				if (!last_substring->first.empty() && !in_square_braces)
-				{
-					trivial_substrings.resize(trivial_substrings.size() + 1);
-					last_substring = &trivial_substrings.back();
-				}
-				++pos;
-				break;
+            case '^': case '$': case '.': case '+':
+                is_trivial = false;
+                if (!last_substring->first.empty() && !in_square_braces)
+                {
+                    trivial_substrings.resize(trivial_substrings.size() + 1);
+                    last_substring = &trivial_substrings.back();
+                }
+                ++pos;
+                break;

- 			/// Quantifiers that allow a zero number.
-			case '{':
-				in_curly_braces = true;
-			case '?': case '*':
-				is_trivial = false;
-				if (!last_substring->first.empty() && !in_square_braces)
-				{
-					last_substring->first.resize(last_substring->first.size() - 1);
-					trivial_substrings.resize(trivial_substrings.size() + 1);
-					last_substring = &trivial_substrings.back();
-				}
-				++pos;
-				break;
+             /// Quantifiers that allow a zero number.
+            case '{':
+                in_curly_braces = true;
+            case '?': case '*':
+                is_trivial = false;
+                if (!last_substring->first.empty() && !in_square_braces)
+                {
+                    last_substring->first.resize(last_substring->first.size() - 1);
+                    trivial_substrings.resize(trivial_substrings.size() + 1);
+                    last_substring = &trivial_substrings.back();
+                }
+                ++pos;
+                break;

-			case '}':
-				if (!in_curly_braces)
-					goto ordinary;
+            case '}':
+                if (!in_curly_braces)
+                    goto ordinary;

-				in_curly_braces = false;
-				++pos;
-				break;
+                in_curly_braces = false;
+                ++pos;
+                break;

- 			ordinary:   /// Normal, not escaped symbol.
-			default:
-				if (depth == 0 && !in_curly_braces && !in_square_braces)
-				{
-					if (last_substring->first.empty())
-						last_substring->second = pos - begin;
-					last_substring->first.push_back(*pos);
-				}
-				++pos;
-				break;
-		}
-	}
+            ordinary:   /// Normal, not escaped symbol.
+            default:
+                if (depth == 0 && !in_curly_braces && !in_square_braces)
+                {
+                    if (last_substring->first.empty())
+                        last_substring->second = pos - begin;
+                    last_substring->first.push_back(*pos);
+                }
+                ++pos;
+                break;
+        }
+    }

-	if (last_substring && last_substring->first.empty())
-		trivial_substrings.pop_back();
+    if (last_substring && last_substring->first.empty())
+        trivial_substrings.pop_back();

-	if (!is_trivial)
-	{
-		if (!has_alternative_on_depth_0)
-		{
-			/** We choose the non-alternative substring of the maximum length, among the prefixes,
-			  *  or a non-alternative substring of maximum length.
-			  */
-			size_t max_length = 0;
-			Substrings::const_iterator candidate_it = trivial_substrings.begin();
-			for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
-			{
-				if (((it->second == 0 && candidate_it->second != 0)
-						|| ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
- 					/// Tuning for the domain
-					&& (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://")))
-					&& (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http")))
-					&& (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www")))
-					&& (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows "))))
-				{
-					max_length = it->first.size();
-					candidate_it = it;
-				}
-			}
+    if (!is_trivial)
+    {
+        if (!has_alternative_on_depth_0)
+        {
+            /** We choose the non-alternative substring of the maximum length, among the prefixes,
+              *  or a non-alternative substring of maximum length.
+              */
+            size_t max_length = 0;
+            Substrings::const_iterator candidate_it = trivial_substrings.begin();
+            for (Substrings::const_iterator it = trivial_substrings.begin(); it != trivial_substrings.end(); ++it)
+            {
+                if (((it->second == 0 && candidate_it->second != 0)
+                        || ((it->second == 0) == (candidate_it->second == 0) && it->first.size() > max_length))
+                    /// Tuning for the domain
+                    && (it->first.size() > strlen("://") || strncmp(it->first.data(), "://", strlen("://")))
+                    && (it->first.size() > strlen("http://") || strncmp(it->first.data(), "http", strlen("http")))
+                    && (it->first.size() > strlen("www.") || strncmp(it->first.data(), "www", strlen("www")))
+                    && (it->first.size() > strlen("Windows ") || strncmp(it->first.data(), "Windows ", strlen("Windows "))))
+                {
+                    max_length = it->first.size();
+                    candidate_it = it;
+                }
+            }

-			if (max_length >= MIN_LENGTH_FOR_STRSTR)
-			{
-				required_substring = candidate_it->first;
-				required_substring_is_prefix = candidate_it->second == 0;
-			}
-		}
-	}
-	else
-	{
-		required_substring = trivial_substrings.front().first;
-		required_substring_is_prefix = trivial_substrings.front().second == 0;
-	}
+            if (max_length >= MIN_LENGTH_FOR_STRSTR)
+            {
+                required_substring = candidate_it->first;
+                required_substring_is_prefix = candidate_it->second == 0;
+            }
+        }
+    }
+    else
+    {
+        required_substring = trivial_substrings.front().first;
+        required_substring_is_prefix = trivial_substrings.front().second == 0;
+    }

-/*	std::cerr
-		<< "regexp: " << regexp
-		<< ", is_trivial: " << is_trivial
-		<< ", required_substring: " << required_substring
-		<< ", required_substring_is_prefix: " << required_substring_is_prefix
-		<< std::endl;*/
+/*    std::cerr
+        << "regexp: " << regexp
+        << ", is_trivial: " << is_trivial
+        << ", required_substring: " << required_substring
+        << ", required_substring_is_prefix: " << required_substring_is_prefix
+        << std::endl;*/
 }


 template <bool b>
 OptimizedRegularExpressionImpl<b>::OptimizedRegularExpressionImpl(const std::string & regexp_, int options)
 {
-	analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);
+    analyze(regexp_, required_substring, is_trivial, required_substring_is_prefix);

- 	/// 3 options are supported
-	if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
-		throw Poco::Exception("OptimizedRegularExpression: Unsupported option.");
+    /// 3 options are supported
+    if (options & (~(RE_CASELESS | RE_NO_CAPTURE | RE_DOT_NL)))
+        throw Poco::Exception("OptimizedRegularExpression: Unsupported option.");

-	is_case_insensitive	= options & RE_CASELESS;
-	bool is_no_capture	= options & RE_NO_CAPTURE;
-	bool is_dot_nl		= options & RE_DOT_NL;
+    is_case_insensitive   = options & RE_CASELESS;
+    bool is_no_capture    = options & RE_NO_CAPTURE;
+    bool is_dot_nl        = options & RE_DOT_NL;

-	number_of_subpatterns = 0;
-	if (!is_trivial)
-	{
- 		/// Compile the re2 regular expression.
-		typename RegexType::Options options;
+    number_of_subpatterns = 0;
+    if (!is_trivial)
+    {
+        /// Compile the re2 regular expression.
+        typename RegexType::Options options;

-		if (is_case_insensitive)
-			options.set_case_sensitive(false);
+        if (is_case_insensitive)
+            options.set_case_sensitive(false);

-		if (is_dot_nl)
-			options.set_dot_nl(true);
+        if (is_dot_nl)
+            options.set_dot_nl(true);

-		re2 = std::make_unique<RegexType>(regexp_, options);
-		if (!re2->ok())
-			throw Poco::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error());
+        re2 = std::make_unique<RegexType>(regexp_, options);
+        if (!re2->ok())
+            throw Poco::Exception("OptimizedRegularExpression: cannot compile re2: " + regexp_ + ", error: " + re2->error());

-		if (!is_no_capture)
-		{
-			number_of_subpatterns = re2->NumberOfCapturingGroups();
-			if (number_of_subpatterns > MAX_SUBPATTERNS)
-				throw Poco::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_);
-		}
-	}
+        if (!is_no_capture)
+        {
+            number_of_subpatterns = re2->NumberOfCapturingGroups();
+            if (number_of_subpatterns > MAX_SUBPATTERNS)
+                throw Poco::Exception("OptimizedRegularExpression: too many subpatterns in regexp: " + regexp_);
+        }
+    }
 }


 template <bool b>
 bool OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subject_size) const
 {
-	if (is_trivial)
-	{
-		if (is_case_insensitive)
-			return nullptr != strcasestr(subject, required_substring.data());
-		else
-			return nullptr != strstr(subject, required_substring.data());
-	}
-	else
-	{
-		if (!required_substring.empty())
-		{
-			const char * pos;
-			if (is_case_insensitive)
-				pos = strcasestr(subject, required_substring.data());
-			else
-				pos = strstr(subject, required_substring.data());
+    if (is_trivial)
+    {
+        if (is_case_insensitive)
+            return nullptr != strcasestr(subject, required_substring.data());
+        else
+            return nullptr != strstr(subject, required_substring.data());
+    }
+    else
+    {
+        if (!required_substring.empty())
+        {
+            const char * pos;
+            if (is_case_insensitive)
+                pos = strcasestr(subject, required_substring.data());
+            else
+                pos = strstr(subject, required_substring.data());

-			if (nullptr == pos)
-				return 0;
-		}
+            if (nullptr == pos)
+                return 0;
+        }

-		return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0);
-	}
+        return re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, nullptr, 0);
+    }
 }


 template <bool b>
 bool OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subject_size, Match & match) const
 {
-	if (is_trivial)
-	{
-		const char * pos;
-		if (is_case_insensitive)
-			pos = strcasestr(subject, required_substring.data());
-		else
-			pos = strstr(subject, required_substring.data());
+    if (is_trivial)
+    {
+        const char * pos;
+        if (is_case_insensitive)
+            pos = strcasestr(subject, required_substring.data());
+        else
+            pos = strstr(subject, required_substring.data());

-		if (pos == nullptr)
-			return 0;
-		else
-		{
-			match.offset = pos - subject;
-			match.length = required_substring.size();
-			return 1;
-		}
-	}
-	else
-	{
-		if (!required_substring.empty())
-		{
-			const char * pos;
-			if (is_case_insensitive)
-				pos = strcasestr(subject, required_substring.data());
-			else
-				pos = strstr(subject, required_substring.data());
+        if (pos == nullptr)
+            return 0;
+        else
+        {
+            match.offset = pos - subject;
+            match.length = required_substring.size();
+            return 1;
+        }
+    }
+    else
+    {
+        if (!required_substring.empty())
+        {
+            const char * pos;
+            if (is_case_insensitive)
+                pos = strcasestr(subject, required_substring.data());
+            else
+                pos = strstr(subject, required_substring.data());

-			if (nullptr == pos)
-				return 0;
-		}
+            if (nullptr == pos)
+                return 0;
+        }

-		StringPieceType piece;
+        StringPieceType piece;

-		if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece))
-			return 0;
-		else
-		{
-			match.offset = piece.data() - subject;
-			match.length = piece.length();
-			return 1;
-		}
-	}
+        if (!RegexType::PartialMatch(StringPieceType(subject, subject_size), *re2, &piece))
+            return 0;
+        else
+        {
+            match.offset = piece.data() - subject;
+            match.length = piece.length();
+            return 1;
+        }
+    }
 }


 template <bool b>
 unsigned OptimizedRegularExpressionImpl<b>::match(const char * subject, size_t subject_size, MatchVec & matches, unsigned limit) const
 {
-	matches.clear();
+    matches.clear();

-	if (limit == 0)
-		return 0;
+    if (limit == 0)
+        return 0;

-	if (limit > number_of_subpatterns + 1)
-		limit = number_of_subpatterns + 1;
+    if (limit > number_of_subpatterns + 1)
+        limit = number_of_subpatterns + 1;

-	if (is_trivial)
-	{
-		const char * pos;
-		if (is_case_insensitive)
-			pos = strcasestr(subject, required_substring.data());
-		else
-			pos = strstr(subject, required_substring.data());
+    if (is_trivial)
+    {
+        const char * pos;
+        if (is_case_insensitive)
+            pos = strcasestr(subject, required_substring.data());
+        else
+            pos = strstr(subject, required_substring.data());

-		if (pos == nullptr)
-			return 0;
-		else
-		{
-			Match match;
-			match.offset = pos - subject;
-			match.length = required_substring.size();
-			matches.push_back(match);
-			return 1;
-		}
-	}
-	else
-	{
-		if (!required_substring.empty())
-		{
-			const char * pos;
-			if (is_case_insensitive)
-				pos = strcasestr(subject, required_substring.data());
-			else
-				pos = strstr(subject, required_substring.data());
+        if (pos == nullptr)
+            return 0;
+        else
+        {
+            Match match;
+            match.offset = pos - subject;
+            match.length = required_substring.size();
+            matches.push_back(match);
+            return 1;
+        }
+    }
+    else
+    {
+        if (!required_substring.empty())
+        {
+            const char * pos;
+            if (is_case_insensitive)
+                pos = strcasestr(subject, required_substring.data());
+            else
+                pos = strstr(subject, required_substring.data());

-			if (nullptr == pos)
-				return 0;
-		}
+            if (nullptr == pos)
+                return 0;
+        }

-		StringPieceType pieces[MAX_SUBPATTERNS];
+        StringPieceType pieces[MAX_SUBPATTERNS];

-		if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces, limit))
-			return 0;
-		else
-		{
-			matches.resize(limit);
-			for (size_t i = 0; i < limit; ++i)
-			{
-				if (pieces[i] != nullptr)
-				{
-					matches[i].offset = pieces[i].data() - subject;
-					matches[i].length = pieces[i].length();
-				}
-				else
-				{
-					matches[i].offset = std::string::npos;
-					matches[i].length = 0;
-				}
-			}
-			return limit;
-		}
-	}
+        if (!re2->Match(StringPieceType(subject, subject_size), 0, subject_size, RegexType::UNANCHORED, pieces, limit))
+            return 0;
+        else
+        {
+            matches.resize(limit);
+            for (size_t i = 0; i < limit; ++i)
+            {
+                if (pieces[i] != nullptr)
+                {
+                    matches[i].offset = pieces[i].data() - subject;
+                    matches[i].length = pieces[i].length();
+                }
+                else
+                {
+                    matches[i].offset = std::string::npos;
+                    matches[i].length = 0;
+                }
+            }
+            return limit;
+        }
+    }
 }

 #undef MIN_LENGTH_FOR_STRSTR