ClickHouse/src/Common/parseGlobs.cpp

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

114 lines
4.5 KiB
C++
Raw Normal View History

2019-07-21 13:15:04 +00:00
#include <Common/parseGlobs.h>
2020-11-09 19:07:38 +00:00
#include <IO/WriteBufferFromString.h>
#include <IO/ReadBufferFromString.h>
#include <IO/Operators.h>
2019-07-21 13:15:04 +00:00
#include <re2/re2.h>
#include <re2/stringpiece.h>
#include <algorithm>
#include <sstream>
#include <iomanip>
2020-03-20 02:15:28 +00:00
2019-07-21 13:15:04 +00:00
namespace DB
{
2020-08-08 01:01:47 +00:00
/* Transforms string from grep-wildcard-syntax ("{N..M}", "{a,b,c}" as in remote table function and "*", "?") to perl-regexp for using re2 library for matching
2019-08-27 15:20:31 +00:00
* with such steps:
* 1) search intervals like {0..9} and enums like {abc,xyz,qwe} in {}, replace them by regexp with pipe (expr1|expr2|expr3),
2019-08-30 13:27:05 +00:00
* 2) search and replace "*" and "?".
2019-08-27 15:20:31 +00:00
* Before each search need to escape symbols that we would not search.
2019-08-29 15:38:33 +00:00
*
* There are few examples in unit tests.
*/
2019-08-27 15:20:31 +00:00
std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_globs)
2019-07-21 13:15:04 +00:00
{
2020-11-09 19:07:38 +00:00
/// FIXME make it better
WriteBufferFromOwnString buf_for_escaping;
2019-08-27 15:20:31 +00:00
/// Escaping only characters that not used in glob syntax
for (const auto & letter : initial_str_with_globs)
2019-07-21 13:15:04 +00:00
{
2020-11-09 19:07:38 +00:00
if ((letter == '[') || (letter == ']') || (letter == '|') || (letter == '+') || (letter == '-') || (letter == '(') || (letter == ')') || (letter == '\\'))
buf_for_escaping << '\\';
buf_for_escaping << letter;
2019-07-21 13:15:04 +00:00
}
2020-11-09 19:07:38 +00:00
std::string escaped_with_globs = buf_for_escaping.str();
2019-09-04 19:55:56 +00:00
2022-07-06 04:18:39 +00:00
static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and ","
2019-08-27 15:20:31 +00:00
re2::StringPiece input(escaped_with_globs);
re2::StringPiece matched;
2020-11-09 19:07:38 +00:00
std::ostringstream oss_for_replacing; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
2020-11-07 00:14:53 +00:00
oss_for_replacing.exceptions(std::ios::failbit);
2019-07-21 13:15:04 +00:00
size_t current_index = 0;
2019-08-27 15:20:31 +00:00
while (RE2::FindAndConsume(&input, enum_or_range, &matched))
2019-07-21 13:15:04 +00:00
{
2019-08-27 15:20:31 +00:00
std::string buffer = matched.ToString();
2019-09-04 19:55:56 +00:00
oss_for_replacing << escaped_with_globs.substr(current_index, matched.data() - escaped_with_globs.data() - current_index - 1) << '(';
2019-08-27 15:20:31 +00:00
if (buffer.find(',') == std::string::npos)
{
2020-03-20 02:15:28 +00:00
size_t range_begin = 0;
size_t range_end = 0;
char point;
2020-11-09 19:07:38 +00:00
ReadBufferFromString buf_range(buffer);
buf_range >> range_begin >> point >> point >> range_end;
2022-06-28 19:49:31 +00:00
size_t range_begin_width = buffer.find('.');
size_t range_end_width = buffer.size() - buffer.find_last_of('.') - 1;
2022-07-06 04:18:39 +00:00
bool leading_zeros = buffer[0] == '0';
size_t output_width = 0;
if (range_begin > range_end) //Descending Sequence {20..15} {9..01}
{
std::swap(range_begin,range_end);
leading_zeros = buffer[buffer.find_last_of('.')+1]=='0';
std::swap(range_begin_width,range_end_width);
}
2022-06-28 19:49:31 +00:00
if (range_begin_width == 1 && leading_zeros)
2022-07-06 04:18:39 +00:00
output_width = 1; ///Special Case: {0..10} {0..999}
2022-06-28 19:49:31 +00:00
else
2022-07-06 04:18:39 +00:00
output_width = std::max(range_begin_width, range_end_width);
if (leading_zeros)
oss_for_replacing << std::setfill('0') << std::setw(static_cast<int>(output_width));
2019-09-04 19:55:56 +00:00
oss_for_replacing << range_begin;
2022-07-06 04:18:39 +00:00
2019-08-27 15:20:31 +00:00
for (size_t i = range_begin + 1; i <= range_end; ++i)
{
oss_for_replacing << '|';
if (leading_zeros)
oss_for_replacing << std::setfill('0') << std::setw(static_cast<int>(output_width));
oss_for_replacing << i;
2019-08-27 15:20:31 +00:00
}
}
else
{
std::replace(buffer.begin(), buffer.end(), ',', '|');
2019-09-04 19:55:56 +00:00
oss_for_replacing << buffer;
2019-08-27 15:20:31 +00:00
}
2019-09-04 19:55:56 +00:00
oss_for_replacing << ")";
2019-08-27 15:20:31 +00:00
current_index = input.data() - escaped_with_globs.data();
2019-07-21 13:15:04 +00:00
}
2019-09-04 19:55:56 +00:00
oss_for_replacing << escaped_with_globs.substr(current_index);
std::string almost_res = oss_for_replacing.str();
2020-11-09 19:07:38 +00:00
WriteBufferFromOwnString buf_final_processing;
char previous = ' ';
for (const auto & letter : almost_res)
2019-07-21 13:15:04 +00:00
{
if (previous == '*' && letter == '*')
{
buf_final_processing << "[^{}]";
}
else if ((letter == '?') || (letter == '*'))
2019-07-21 13:15:04 +00:00
{
2020-11-09 19:07:38 +00:00
buf_final_processing << "[^/]"; /// '?' is any symbol except '/'
2019-07-21 13:15:04 +00:00
if (letter == '?')
continue;
}
else if ((letter == '.') || (letter == '{') || (letter == '}'))
2020-11-09 19:07:38 +00:00
buf_final_processing << '\\';
buf_final_processing << letter;
previous = letter;
2019-07-21 13:15:04 +00:00
}
2020-11-09 19:07:38 +00:00
return buf_final_processing.str();
2019-07-21 13:15:04 +00:00
}
}