From 644437cfb8442a01acbd208475a2c2488d9e4af0 Mon Sep 17 00:00:00 2001 From: Konstantin Bogdanov Date: Thu, 28 Nov 2024 01:23:53 +0100 Subject: [PATCH 1/2] Fix parsing a glob with one element --- src/Common/parseGlobs.cpp | 11 ++++++++++- src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp | 2 ++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/Common/parseGlobs.cpp b/src/Common/parseGlobs.cpp index de6caec3149..696a71ed403 100644 --- a/src/Common/parseGlobs.cpp +++ b/src/Common/parseGlobs.cpp @@ -35,7 +35,7 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob } std::string escaped_with_globs = buf_for_escaping.str(); - static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and "," + static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and "," std::string_view input(escaped_with_globs); std::string_view matched; std::ostringstream oss_for_replacing; // STYLE_CHECK_ALLOW_STD_STRING_STREAM @@ -48,6 +48,15 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob if (!buffer.contains(',')) { + /// No dot or one dot in the filename. This is not a range. + if (buffer.find('.') == std::string::npos || buffer.find('.') == buffer.find_last_of('.')) + { + oss_for_replacing << buffer; + oss_for_replacing << ")"; + current_index = input.data() - escaped_with_globs.data(); + break; + } + size_t range_begin = 0; size_t range_end = 0; char point; diff --git a/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp b/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp index 6e15c0f712c..654767efc7f 100644 --- a/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp +++ b/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp @@ -12,6 +12,8 @@ TEST(Common, makeRegexpPatternFromGlobs) EXPECT_EQ(makeRegexpPatternFromGlobs("*"), "[^/]*"); EXPECT_EQ(makeRegexpPatternFromGlobs("/?"), "/[^/]"); EXPECT_EQ(makeRegexpPatternFromGlobs("/*"), "/[^/]*"); + EXPECT_EQ(makeRegexpPatternFromGlobs("{123}"), "(123)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("{test}"), "(test)"); EXPECT_EQ(makeRegexpPatternFromGlobs("*_{{a,b,c,d}}/?.csv"), "[^/]*_\\{(a|b|c|d)\\}/[^/]\\.csv"); /* Regex Parsing for {..} can have three possible cases 1) The left range width == the right range width From 8f723820521ffa8128c10cdd84e115204c222dde Mon Sep 17 00:00:00 2001 From: Konstantin Bogdanov Date: Thu, 28 Nov 2024 02:52:50 +0100 Subject: [PATCH 2/2] Split in two regexes --- src/Common/parseGlobs.cpp | 95 ++++++++++--------- .../gtest_makeRegexpPatternFromGlobs.cpp | 1 + 2 files changed, 49 insertions(+), 47 deletions(-) diff --git a/src/Common/parseGlobs.cpp b/src/Common/parseGlobs.cpp index 696a71ed403..b785a660285 100644 --- a/src/Common/parseGlobs.cpp +++ b/src/Common/parseGlobs.cpp @@ -35,70 +35,71 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob } std::string escaped_with_globs = buf_for_escaping.str(); - static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and "," - std::string_view input(escaped_with_globs); + static const re2::RE2 range_regex(R"({([\d]+\.\.[\d]+)})"); /// regexp for {M..N}, where M and N - non-negative integers + static const re2::RE2 enum_regex(R"({([^{}*,]+[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3}, expr's should be without "{", "}", "*" and "," + std::string_view matched; + std::string_view input(escaped_with_globs); std::ostringstream oss_for_replacing; // STYLE_CHECK_ALLOW_STD_STRING_STREAM oss_for_replacing.exceptions(std::ios::failbit); size_t current_index = 0; - while (RE2::FindAndConsume(&input, enum_or_range, &matched)) + + while (RE2::FindAndConsume(&input, range_regex, &matched)) { std::string buffer(matched); oss_for_replacing << escaped_with_globs.substr(current_index, matched.data() - escaped_with_globs.data() - current_index - 1) << '('; - if (!buffer.contains(',')) + size_t range_begin = 0; + size_t range_end = 0; + char point; + ReadBufferFromString buf_range(buffer); + buf_range >> range_begin >> point >> point >> range_end; + + size_t range_begin_width = buffer.find('.'); + size_t range_end_width = buffer.size() - buffer.find_last_of('.') - 1; + bool leading_zeros = buffer[0] == '0'; + size_t output_width = 0; + + if (range_begin > range_end) //Descending Sequence {20..15} {9..01} { - /// No dot or one dot in the filename. This is not a range. - if (buffer.find('.') == std::string::npos || buffer.find('.') == buffer.find_last_of('.')) - { - oss_for_replacing << buffer; - oss_for_replacing << ")"; - current_index = input.data() - escaped_with_globs.data(); - break; - } + std::swap(range_begin,range_end); + leading_zeros = buffer[buffer.find_last_of('.')+1]=='0'; + std::swap(range_begin_width,range_end_width); + } + if (range_begin_width == 1 && leading_zeros) + output_width = 1; ///Special Case: {0..10} {0..999} + else + output_width = std::max(range_begin_width, range_end_width); - size_t range_begin = 0; - size_t range_end = 0; - char point; - ReadBufferFromString buf_range(buffer); - buf_range >> range_begin >> point >> point >> range_end; - - size_t range_begin_width = buffer.find('.'); - size_t range_end_width = buffer.size() - buffer.find_last_of('.') - 1; - bool leading_zeros = buffer[0] == '0'; - size_t output_width = 0; - - if (range_begin > range_end) //Descending Sequence {20..15} {9..01} - { - std::swap(range_begin,range_end); - leading_zeros = buffer[buffer.find_last_of('.')+1]=='0'; - std::swap(range_begin_width,range_end_width); - } - if (range_begin_width == 1 && leading_zeros) - output_width = 1; ///Special Case: {0..10} {0..999} - else - output_width = std::max(range_begin_width, range_end_width); + if (leading_zeros) + oss_for_replacing << std::setfill('0') << std::setw(static_cast(output_width)); + oss_for_replacing << range_begin; + for (size_t i = range_begin + 1; i <= range_end; ++i) + { + oss_for_replacing << '|'; if (leading_zeros) oss_for_replacing << std::setfill('0') << std::setw(static_cast(output_width)); - oss_for_replacing << range_begin; + oss_for_replacing << i; + } - for (size_t i = range_begin + 1; i <= range_end; ++i) - { - oss_for_replacing << '|'; - if (leading_zeros) - oss_for_replacing << std::setfill('0') << std::setw(static_cast(output_width)); - oss_for_replacing << i; - } - } - else - { - std::replace(buffer.begin(), buffer.end(), ',', '|'); - oss_for_replacing << buffer; - } oss_for_replacing << ")"; current_index = input.data() - escaped_with_globs.data(); } + + while (RE2::FindAndConsume(&input, enum_regex, &matched)) + { + std::string buffer(matched); + + oss_for_replacing << escaped_with_globs.substr(current_index, matched.data() - escaped_with_globs.data() - current_index - 1) << '('; + std::replace(buffer.begin(), buffer.end(), ',', '|'); + + oss_for_replacing << buffer; + oss_for_replacing << ")"; + + current_index = input.data() - escaped_with_globs.data(); + } + oss_for_replacing << escaped_with_globs.substr(current_index); std::string almost_res = oss_for_replacing.str(); WriteBufferFromOwnString buf_final_processing; diff --git a/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp b/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp index 654767efc7f..f1e04fc6dfd 100644 --- a/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp +++ b/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp @@ -14,6 +14,7 @@ TEST(Common, makeRegexpPatternFromGlobs) EXPECT_EQ(makeRegexpPatternFromGlobs("/*"), "/[^/]*"); EXPECT_EQ(makeRegexpPatternFromGlobs("{123}"), "(123)"); EXPECT_EQ(makeRegexpPatternFromGlobs("{test}"), "(test)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("{test.tar.gz}"), "(test\\.tar\\.gz)"); EXPECT_EQ(makeRegexpPatternFromGlobs("*_{{a,b,c,d}}/?.csv"), "[^/]*_\\{(a|b|c|d)\\}/[^/]\\.csv"); /* Regex Parsing for {..} can have three possible cases 1) The left range width == the right range width