Review Comments

This commit is contained in:
HeenaBansal2009 2022-07-05 21:18:39 -07:00
parent 608bfb8453
commit d89ba2e5d9
4 changed files with 78 additions and 13 deletions

View File

@ -32,7 +32,7 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob
}
std::string escaped_with_globs = buf_for_escaping.str();
static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without {}*,
static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and ","
re2::StringPiece input(escaped_with_globs);
re2::StringPiece matched;
std::ostringstream oss_for_replacing; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
@ -50,25 +50,32 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob
char point;
ReadBufferFromString buf_range(buffer);
buf_range >> range_begin >> point >> point >> range_end;
bool leading_zeros = buffer[0] == '0';
size_t range_begin_width = buffer.find('.');
size_t range_end_width = buffer.size() - buffer.find_last_of('.') - 1;
//Scenarios {0..10} {0..999}
size_t num_len = 0;
bool leading_zeros = buffer[0] == '0';
size_t output_width = 0;
if (range_begin > range_end) //Descending Sequence {20..15} {9..01}
{
std::swap(range_begin,range_end);
leading_zeros = buffer[buffer.find_last_of('.')+1]=='0';
std::swap(range_begin_width,range_end_width);
}
if (range_begin_width == 1 && leading_zeros)
num_len = 1;
//Scenarios {00..99} {00..099}
output_width = 1; ///Special Case: {0..10} {0..999}
else
num_len = range_begin_width < range_end_width ? range_end_width : range_begin_width;
output_width = std::max(range_begin_width, range_end_width);
if (leading_zeros)
oss_for_replacing << std::setfill('0') << std::setw(num_len);
oss_for_replacing << std::setfill('0') << std::setw(output_width);
oss_for_replacing << range_begin;
for (size_t i = range_begin + 1; i <= range_end; ++i)
{
oss_for_replacing << '|';
if (leading_zeros)
oss_for_replacing << std::setfill('0') << std::setw(num_len);
oss_for_replacing << std::setfill('0') << std::setw(output_width);
oss_for_replacing << i;
}
}

View File

@ -8,17 +8,36 @@ using namespace DB;
TEST(Common, makeRegexpPatternFromGlobs)
{
EXPECT_EQ(makeRegexpPatternFromGlobs("?"), "[^/]");
EXPECT_EQ(makeRegexpPatternFromGlobs("*"), "[^/]*");
EXPECT_EQ(makeRegexpPatternFromGlobs("/?"), "/[^/]");
EXPECT_EQ(makeRegexpPatternFromGlobs("/*"), "/[^/]*");
EXPECT_EQ(makeRegexpPatternFromGlobs("*_{{a,b,c,d}}/?.csv"), "[^/]*_\\{(a|b|c|d)\\}/[^/]\\.csv");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{01..9}"), "f(01|02|03|04|05|06|07|08|09)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{001..0009}"), "f(0001|0002|0003|0004|0005|0006|0007|0008|0009)");
/* Regex Parsing for {..} can have three possible cases
1) The left range width == the right range width
2) The left range width > the right range width
3) The left range width < the right range width
*/
// Ascending Sequences
EXPECT_EQ(makeRegexpPatternFromGlobs("f{1..9}"), "f(1|2|3|4|5|6|7|8|9)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{0..10}"), "f(0|1|2|3|4|5|6|7|8|9|10)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{10..20}"), "f(10|11|12|13|14|15|16|17|18|19|20)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{00..10}"), "f(00|01|02|03|04|05|06|07|08|09|10)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{000..9}"), "f(000|001|002|003|004|005|006|007|008|009)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{0001..0009}"), "f(0001|0002|0003|0004|0005|0006|0007|0008|0009)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{01..9}"), "f(01|02|03|04|05|06|07|08|09)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{000..9}"), "f(000|001|002|003|004|005|006|007|008|009)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{95..103}"), "f(95|96|97|98|99|100|101|102|103)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{99..109}"), "f(99|100|101|102|103|104|105|106|107|108|109)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{001..0009}"), "f(0001|0002|0003|0004|0005|0006|0007|0008|0009)");
// Descending Sequences
EXPECT_EQ(makeRegexpPatternFromGlobs("f{20..15}"), "f(15|16|17|18|19|20)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{200..199}"), "f(199|200)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{0009..0001}"), "f(0001|0002|0003|0004|0005|0006|0007|0008|0009)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{100..90}"), "f(90|91|92|93|94|95|96|97|98|99|100)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{103..95}"), "f(95|96|97|98|99|100|101|102|103)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{9..01}"), "f(01|02|03|04|05|06|07|08|09)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{9..000}"), "f(000|001|002|003|004|005|006|007|008|009)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{1..2}{1..2}"), "f(1|2)(1|2)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{1..1}{1..1}"), "f(1)(1)");
EXPECT_EQ(makeRegexpPatternFromGlobs("f{0..0}{0..0}"), "f(0)(0)");

View File

@ -0,0 +1,38 @@
#!/usr/bin/env bash
# Tags: no-fasttest
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh
# Data preparation.
# Now we can get the user_files_path by use the table file function for trick. also we can get it by query as:
# "insert into function file('exist.txt', 'CSV', 'val1 char') values ('aaaa'); select _path from file('exist.txt', 'CSV', 'val1 char')"
CLICKHOUSE_USER_FILES_PATH=$(clickhouse-client --query "select _path, _file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}')
mkdir -p ${CLICKHOUSE_USER_FILES_PATH}/
echo '{"obj": "aaa", "id": 1, "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_0.json
echo '{"id": 2, "obj": "bbb", "s": "bar"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_1.json
echo '{"id": 3, "obj": "ccc", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_2.json
echo '{"id": 4, "obj": "ddd", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_3.json
echo '{"id": 5, "obj": "eee", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_4.json
echo '{"id": 6, "obj": "fff", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_5.json
echo '{"id": 7, "obj": "ggg", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_6.json
echo '{"id": 8, "obj": "hhh", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_7.json
echo '{"id": 9, "obj": "iii", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_8.json
echo '{"id": 10, "obj":"jjj", "s": "foo"}' >> ${CLICKHOUSE_USER_FILES_PATH}/file_9.json
echo '{"id": 11, "obj": "kkk", "s": "foo"}'>> ${CLICKHOUSE_USER_FILES_PATH}/file_10.json
${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_regex"
${CLICKHOUSE_CLIENT} -q "CREATE TABLE t_regex (id UInt64, obj String, s String) ENGINE =File(JSONEachRow)" ;
${CLICKHOUSE_CLIENT} -q "INSERT INTO t_regex SELECT * FROM file('file_{0..10}.json','JSONEachRow')";
${CLICKHOUSE_CLIENT} -q "SELECT count() from t_regex"
rm -rf ${CLICKHOUSE_USER_FILES_PATH}/file_*.json;
${CLICKHOUSE_CLIENT} -q "DROP TABLE IF EXISTS t_regex"