some changes

This commit is contained in:
stavrolia 2019-08-27 18:20:31 +03:00
parent 1190e866aa
commit f4e0dceddb
4 changed files with 52 additions and 52 deletions

View File

@ -5,56 +5,60 @@
namespace DB
{
/* Because grep-wildcard syntax differs from perl-regexp syntax, the string needs some transformation before the RE2 library can be used for matching.
* It cannot be done in one pass because of the various configurations of braces in filenames (Linux allows almost any symbol in paths).
* So there are several iterations of escaping and replacement to produce a correct perl-regexp.
/* Transforms a string from grep-wildcard syntax ("{N..M}", "{a,b,c}" as in the remote table function, and "*", "?") to a perl-regexp for matching with the re2 library,
* in the following steps:
* 1) find intervals and enums in {} and replace them with a regexp alternation (expr1|expr2|expr3),
* 2) find and replace "*" and "?".
* Before each search, the symbols that are not being searched for need to be escaped.
*/
std::string makeRegexpPatternFromGlobs(const std::string & initial_str)
std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_globs)
{
std::string first_prepare;
first_prepare.reserve(initial_str.size());
for (const auto & letter : initial_str)
std::string escaped_with_globs;
escaped_with_globs.reserve(initial_str_with_globs.size());
/// Escape only the characters that are not used in glob syntax
for (const auto & letter : initial_str_with_globs)
{
if ((letter == '[') || (letter == ']') || (letter == '|') || (letter == '+'))
first_prepare.push_back('\\');
first_prepare.push_back(letter);
if ((letter == '[') || (letter == ']') || (letter == '|') || (letter == '+') || (letter == '-') || (letter == '(') || (letter == ')'))
escaped_with_globs.push_back('\\');
escaped_with_globs.push_back(letter);
}
re2::RE2 char_range(R"(({[^*?/\\]\.\.[^*?/\\]}))");
re2::StringPiece input_for_range(first_prepare);
re2::StringPiece matched_range(first_prepare);
std::string second_prepare;
second_prepare.reserve(first_prepare.size());
re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without {}*,
re2::StringPiece input(escaped_with_globs);
re2::StringPiece matched(escaped_with_globs);
size_t current_index = 0;
size_t pos;
while (RE2::FindAndConsume(&input_for_range, char_range, &matched_range))
std::string almost_regexp;
almost_regexp.reserve(escaped_with_globs.size());
while (RE2::FindAndConsume(&input, enum_or_range, &matched))
{
pos = matched_range.data() - first_prepare.data();
second_prepare += first_prepare.substr(current_index, pos - current_index);
second_prepare.append({'[', matched_range.ToString()[1], '-', matched_range.ToString()[4], ']'});
current_index = input_for_range.data() - first_prepare.data();
std::string buffer = matched.ToString();
almost_regexp.append(escaped_with_globs.substr(current_index, matched.data() - escaped_with_globs.data() - current_index - 1));
if (buffer.find(',') == std::string::npos)
{
size_t first_point = buffer.find('.');
std::string first_number = buffer.substr(0, first_point);
std::string second_number = buffer.substr(first_point + 2, buffer.size() - first_point - 2);
size_t range_begin = std::stoull(first_number);
size_t range_end = std::stoull(second_number);
buffer = std::to_string(range_begin);
for (size_t i = range_begin + 1; i <= range_end; ++i)
{
buffer += "|";
buffer += std::to_string(i);
}
}
else
{
std::replace(buffer.begin(), buffer.end(), ',', '|');
}
almost_regexp.append("(" + buffer + ")");
current_index = input.data() - escaped_with_globs.data();
}
second_prepare += first_prepare.substr(current_index);
re2::RE2 enumeration(R"(({[^{}*,]+,[^{}*]*[^{}*,]}))");
re2::StringPiece input_enum(second_prepare);
re2::StringPiece matched_enum(second_prepare);
current_index = 0;
std::string third_prepare;
while (RE2::FindAndConsume(&input_enum, enumeration, &matched_enum))
{
pos = matched_enum.data() - second_prepare.data();
third_prepare.append(second_prepare.substr(current_index, pos - current_index));
std::string buffer = matched_enum.ToString();
buffer[0] = '(';
buffer.back() = ')';
std::replace(buffer.begin(), buffer.end(), ',', '|');
third_prepare.append(buffer);
current_index = input_enum.data() - second_prepare.data();
}
third_prepare += second_prepare.substr(current_index);
almost_regexp += escaped_with_globs.substr(current_index); /////
std::string result;
result.reserve(third_prepare.size());
for (const auto & letter : third_prepare)
result.reserve(almost_regexp.size());
for (const auto & letter : almost_regexp)
{
if ((letter == '?') || (letter == '*'))
{
@ -62,7 +66,7 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str)
if (letter == '?')
continue;
}
if ((letter == '.') || (letter == '{') || (letter == '}'))
if ((letter == '.') || (letter == '{') || (letter == '}') || (letter == '\\'))
result.push_back('\\');
result.push_back(letter);
}

View File

@ -27,10 +27,9 @@
#include <re2/re2.h>
#include <re2/stringpiece.h>
#include <boost/filesystem.hpp>
#include <boost/filesystem/operations.hpp>
#include <filesystem>
namespace fs = boost::filesystem;
namespace fs = std::filesystem;
namespace DB
{

View File

@ -32,9 +32,7 @@ def test_strange_filenames(start_cluster):
("p.o.*t.s", "2"),
("b}{r?{ces", "2"),
("b}*ces", "2"),
("b}{r{a..z}{ces", "2"),
("b}.?{t.h", "2"),
("b}.?{t.{a..z}", "2")]
("b}.?{t.h", "2")]
for pattern, value in test_requests:
assert node.query('''
@ -64,8 +62,8 @@ def test_linear_structure(start_cluster):
("file*", "20"),
("a_{file,data}", "4"),
("?_{file,data}", "20"),
("{a..z}_{file,data}", "20"),
("{a..z}?{file,data}", "20"),
("{a,b,c,d,e}_{file,data}", "20"),
("{a,b,c,d,e}?{file,data}", "20"),
("*", "40")]
for pattern, value in test_requests:
@ -106,7 +104,7 @@ def test_deep_structure(start_cluster):
test_requests = [ ("directory{1..5}/big_dir/*", "2002"), ("directory{0..6}/big_dir/*{0..9}{0..9}{0..9}", "2000"),
("?", "0"),
("directory{0..5}/dir{1..3}/file", "10"), ("directory{0..5}/dir?/file", "10"),
("we/need/to/go/deeper/file", "2"), ("*/*/*/*/*/*", "2"), ("we/n{a..z}ed/to/*/deeper/{a..z}{a..z}{a..z}{a..z}", "2"), ("we/need/??/go/deeper/*?*?*?*?*", "2")]
("we/need/to/go/deeper/file", "2"), ("*/*/*/*/*/*", "2"), ("we/need/??/go/deeper/*?*?*?*?*", "2")]
for pattern, value in test_requests:
assert node.query('''

View File

@ -91,7 +91,6 @@ def test_globs_in_read_table(started_cluster):
("dir/*", 1),
("dir/*?*?*?*?*", 1),
("dir/*?*?*?*?*?*", 0),
("dir/*{a..z}*{a..z}*{a..z}*{a..z}*", 1),
("some_dir/*/file", 2),
("some_dir/dir?/*", 2),
("*/*/*", 3),