From d89ba2e5d98600e1bf682b987339f132b8a6b6cb Mon Sep 17 00:00:00 2001 From: HeenaBansal2009 Date: Tue, 5 Jul 2022 21:18:39 -0700 Subject: [PATCH] Review Comments --- src/Common/parseGlobs.cpp | 25 +++++++----- .../gtest_makeRegexpPatternFromGlobs.cpp | 27 +++++++++++-- .../02297_regex_parsing_file_names.reference | 1 + .../02297_regex_parsing_file_names.sh | 38 +++++++++++++++++++ 4 files changed, 78 insertions(+), 13 deletions(-) create mode 100644 tests/queries/0_stateless/02297_regex_parsing_file_names.reference create mode 100755 tests/queries/0_stateless/02297_regex_parsing_file_names.sh diff --git a/src/Common/parseGlobs.cpp b/src/Common/parseGlobs.cpp index 9e18a9c0780..8e9195f9842 100644 --- a/src/Common/parseGlobs.cpp +++ b/src/Common/parseGlobs.cpp @@ -32,7 +32,7 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob } std::string escaped_with_globs = buf_for_escaping.str(); - static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without {}*, + static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and "," re2::StringPiece input(escaped_with_globs); re2::StringPiece matched; std::ostringstream oss_for_replacing; // STYLE_CHECK_ALLOW_STD_STRING_STREAM @@ -50,25 +50,32 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob char point; ReadBufferFromString buf_range(buffer); buf_range >> range_begin >> point >> point >> range_end; - bool leading_zeros = buffer[0] == '0'; size_t range_begin_width = buffer.find('.'); size_t range_end_width = buffer.size() - buffer.find_last_of('.') - 1; - //Scenarios {0..10} {0..999} - size_t num_len = 0; + bool leading_zeros = buffer[0] == '0'; + size_t output_width = 0; + + if 
(range_begin > range_end) //Descending Sequence {20..15} {9..01} + { + std::swap(range_begin,range_end); + leading_zeros = buffer[buffer.find_last_of('.')+1]=='0'; + std::swap(range_begin_width,range_end_width); + } if (range_begin_width == 1 && leading_zeros) - num_len = 1; - //Scenarios {00..99} {00..099} + output_width = 1; ///Special Case: {0..10} {0..999} else - num_len = range_begin_width < range_end_width ? range_end_width : range_begin_width; + output_width = std::max(range_begin_width, range_end_width); + if (leading_zeros) - oss_for_replacing << std::setfill('0') << std::setw(num_len); + oss_for_replacing << std::setfill('0') << std::setw(output_width); oss_for_replacing << range_begin; + for (size_t i = range_begin + 1; i <= range_end; ++i) { oss_for_replacing << '|'; if (leading_zeros) - oss_for_replacing << std::setfill('0') << std::setw(num_len); + oss_for_replacing << std::setfill('0') << std::setw(output_width); oss_for_replacing << i; } } diff --git a/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp b/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp index 833a0a6ba73..fda3a6ee1c8 100644 --- a/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp +++ b/src/Common/tests/gtest_makeRegexpPatternFromGlobs.cpp @@ -8,21 +8,40 @@ using namespace DB; TEST(Common, makeRegexpPatternFromGlobs) { + EXPECT_EQ(makeRegexpPatternFromGlobs("?"), "[^/]"); EXPECT_EQ(makeRegexpPatternFromGlobs("*"), "[^/]*"); EXPECT_EQ(makeRegexpPatternFromGlobs("/?"), "/[^/]"); EXPECT_EQ(makeRegexpPatternFromGlobs("/*"), "/[^/]*"); EXPECT_EQ(makeRegexpPatternFromGlobs("*_{{a,b,c,d}}/?.csv"), "[^/]*_\\{(a|b|c|d)\\}/[^/]\\.csv"); - EXPECT_EQ(makeRegexpPatternFromGlobs("f{01..9}"), "f(01|02|03|04|05|06|07|08|09)"); - EXPECT_EQ(makeRegexpPatternFromGlobs("f{001..0009}"), "f(0001|0002|0003|0004|0005|0006|0007|0008|0009)"); + /* Regex Parsing for {..} can have three possible cases + 1) The left range width == the right range width + 2) The left range width > the right range width 
+ 3) The left range width < the right range width + */ + // Ascending Sequences + EXPECT_EQ(makeRegexpPatternFromGlobs("f{1..9}"), "f(1|2|3|4|5|6|7|8|9)"); EXPECT_EQ(makeRegexpPatternFromGlobs("f{0..10}"), "f(0|1|2|3|4|5|6|7|8|9|10)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{10..20}"), "f(10|11|12|13|14|15|16|17|18|19|20)"); EXPECT_EQ(makeRegexpPatternFromGlobs("f{00..10}"), "f(00|01|02|03|04|05|06|07|08|09|10)"); - EXPECT_EQ(makeRegexpPatternFromGlobs("f{000..9}"), "f(000|001|002|003|004|005|006|007|008|009)"); EXPECT_EQ(makeRegexpPatternFromGlobs("f{0001..0009}"), "f(0001|0002|0003|0004|0005|0006|0007|0008|0009)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{01..9}"), "f(01|02|03|04|05|06|07|08|09)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{000..9}"), "f(000|001|002|003|004|005|006|007|008|009)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{95..103}"), "f(95|96|97|98|99|100|101|102|103)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{99..109}"), "f(99|100|101|102|103|104|105|106|107|108|109)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{001..0009}"), "f(0001|0002|0003|0004|0005|0006|0007|0008|0009)"); + // Descending Sequences + EXPECT_EQ(makeRegexpPatternFromGlobs("f{20..15}"), "f(15|16|17|18|19|20)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{200..199}"), "f(199|200)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{0009..0001}"), "f(0001|0002|0003|0004|0005|0006|0007|0008|0009)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{100..90}"), "f(90|91|92|93|94|95|96|97|98|99|100)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{103..95}"), "f(95|96|97|98|99|100|101|102|103)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{9..01}"), "f(01|02|03|04|05|06|07|08|09)"); + EXPECT_EQ(makeRegexpPatternFromGlobs("f{9..000}"), "f(000|001|002|003|004|005|006|007|008|009)"); EXPECT_EQ(makeRegexpPatternFromGlobs("f{1..2}{1..2}"), "f(1|2)(1|2)"); EXPECT_EQ(makeRegexpPatternFromGlobs("f{1..1}{1..1}"), "f(1)(1)"); EXPECT_EQ(makeRegexpPatternFromGlobs("f{0..0}{0..0}"), "f(0)(0)"); 
EXPECT_EQ(makeRegexpPatternFromGlobs("file{1..5}"),"file(1|2|3|4|5)"); EXPECT_EQ(makeRegexpPatternFromGlobs("file{1,2,3}"),"file(1|2|3)"); EXPECT_EQ(makeRegexpPatternFromGlobs("{1,2,3}blabla{a.x,b.x,c.x}smth[]_else{aa,bb}?*"), "(1|2|3)blabla(a\\.x|b\\.x|c\\.x)smth\\[\\]_else(aa|bb)[^/][^/]*"); -} \ No newline at end of file +} diff --git a/tests/queries/0_stateless/02297_regex_parsing_file_names.reference b/tests/queries/0_stateless/02297_regex_parsing_file_names.reference new file mode 100644 index 00000000000..b4de3947675 --- /dev/null +++ b/tests/queries/0_stateless/02297_regex_parsing_file_names.reference @@ -0,0 +1 @@ +11 diff --git a/tests/queries/0_stateless/02297_regex_parsing_file_names.sh b/tests/queries/0_stateless/02297_regex_parsing_file_names.sh new file mode 100755 index 00000000000..2db4ae8044c --- /dev/null +++ b/tests/queries/0_stateless/02297_regex_parsing_file_names.sh @@ -0,0 +1,38 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Data preparation. + +# Now we can get the user_files_path by using the table file function as a trick. 
We can also get it with a query such as: +# "insert into function file('exist.txt', 'CSV', 'val1 char') values ('aaaa'); select _path from file('exist.txt', 'CSV', 'val1 char')" +CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') + +mkdir -p ${CLICKHOUSE_USER_FILES_PATH}/ + +echo '{"obj": "aaa", "id": 1, "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_0.json +echo '{"id": 2, "obj": "bbb", "s": "bar"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_1.json +echo '{"id": 3, "obj": "ccc", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_2.json +echo '{"id": 4, "obj": "ddd", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_3.json +echo '{"id": 5, "obj": "eee", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_4.json +echo '{"id": 6, "obj": "fff", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_5.json +echo '{"id": 7, "obj": "ggg", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_6.json +echo '{"id": 8, "obj": "hhh", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_7.json +echo '{"id": 9, "obj": "iii", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_8.json +echo '{"id": 10, "obj":"jjj", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_9.json +echo '{"id": 11, "obj": "kkk", "s": "foo"}'>> ${CLICKHOUSE_USER_FILES_PATH}/file_10.json + + +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_regex" + +${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_regex (id UInt64, obj String, s String) ENGINE =File(JSONEachRow)" ; + + +${CLICKHOUSE_CLIENT} -q "INSERT INTO t_regex SELECT * FROM file('file_{0..10}.json','JSONEachRow')"; +${CLICKHOUSE_CLIENT} -q "SELECT count() from t_regex" + +rm -rf ${CLICKHOUSE_USER_FILES_PATH}/file_*.json; +${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_regex"