Added a shortcut for no-wildcard globs in HDFS

This commit is contained in:
zvonand 2023-11-02 09:58:15 +01:00
parent 3e47a54a94
commit 70aa6e2672
4 changed files with 100 additions and 132 deletions

View File

@ -17,6 +17,11 @@
namespace DB
{
/// Error codes referenced by the glob helpers in this file.
/// Only declared here (extern); the definition lives in another translation unit.
namespace ErrorCodes
{
extern const int BAD_ARGUMENTS;
}
/* Transforms string from grep-wildcard-syntax ("{N..M}", "{a,b,c}" as in remote table function and "*", "?") to perl-regexp for using re2 library for matching
* with such steps:
* 1) search intervals like {0..9} and enums like {abc,xyz,qwe} in {}, replace them by regexp with pipe (expr1|expr2|expr3),
@ -116,4 +121,67 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob
}
return buf_final_processing.str();
}
/// Expands the first `{a,b,c}` selector glob of `path` into one path per alternative and recurses
/// on every produced path, so that all selectors end up expanded.
/// A path that contains no selector is appended to `for_match_paths_expanded` unchanged.
///
/// Brace groups without a comma (for example the range glob `{N..M}`) are not selectors:
/// they are skipped and kept verbatim, so the later regexp-based matching still sees them.
///
/// Throws BAD_ARGUMENTS on a '{' inside an already opened group, on a '}' or ',' met outside
/// of braces while searching for the selector, or when no complete selector group is located.
void expandSelector(const std::string & path, Strings & for_match_paths_expanded)
{
    /// regexp for {expr1,expr2,expr3}, expr.. should be without "{", "}", "*" and ","
    static const re2::RE2 selector_regex(R"({([^{}*,]+,[^{}*]*[^{}*,])})");

    std::string_view path_view(path);
    std::string_view matched;

    /// No selector glob anywhere in the path: nothing to expand.
    if (!RE2::FindAndConsume(&path_view, selector_regex, &matched))
    {
        for_match_paths_expanded.push_back(path);
        return;
    }

    /// Positions of '{', of every ',' and of '}' for the first selector group of the path.
    std::vector<size_t> anchor_positions;
    bool opened = false;
    bool closed = false;

    for (size_t pos = 0; pos < path.size(); ++pos)
    {
        const char symbol = path[pos];

        if (symbol == '{')
        {
            if (opened)
                throw Exception(ErrorCodes::BAD_ARGUMENTS,
                    "Unexpected '{{' found in path '{}' at position {}.", path, pos);
            anchor_positions.push_back(pos);
            opened = true;
        }
        else if (symbol == '}')
        {
            if (!opened)
                throw Exception(ErrorCodes::BAD_ARGUMENTS,
                    "Unexpected '}}' found in path '{}' at position {}.", path, pos);

            if (anchor_positions.size() == 1)
            {
                /// Only the opening brace was recorded, i.e. the group has no commas ({N..M} and alike).
                /// It is not a selector: leave it untouched and keep looking for a real one.
                anchor_positions.clear();
                opened = false;
                continue;
            }

            anchor_positions.push_back(pos);
            closed = true;
            break;
        }
        else if (symbol == ',')
        {
            if (!opened)
                throw Exception(ErrorCodes::BAD_ARGUMENTS,
                    "Unexpected ',' found in path '{}' at position {}.", path, pos);
            anchor_positions.push_back(pos);
        }
    }

    if (!opened || !closed)
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Invalid {{}} glob in path {}.", path);

    /// Everything before '{' and after '}' is shared by all alternatives.
    const std::string common_prefix = path.substr(0, anchor_positions.front());
    const std::string common_suffix = path.substr(anchor_positions.back() + 1);

    for (size_t i = 1; i < anchor_positions.size(); ++i)
    {
        /// The alternative is the text strictly between two neighbouring anchors.
        const size_t alternative_begin = anchor_positions[i - 1] + 1;
        const std::string expanded_matcher = common_prefix
            + path.substr(alternative_begin, anchor_positions[i] - alternative_begin)
            + common_suffix;

        /// The produced path may contain further selectors: expand them recursively.
        expandSelector(expanded_matcher, for_match_paths_expanded);
    }
}
}

View File

@ -7,4 +7,8 @@ namespace DB
/* Parse globs in string and make a regexp for it.
 * The input uses grep-wildcard syntax ("{N..M}", "{a,b,c}", "*", "?");
 * the returned string is a regexp pattern intended for matching with the re2 library.
*/
std::string makeRegexpPatternFromGlobs(const std::string &initial_str_with_globs);
/// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead.
/// Every expanded path is appended to `for_match_paths_expanded`; a path without such a glob is appended as is.
/// Throws an exception with code BAD_ARGUMENTS when the braces or commas in `path` are malformed.
void expandSelector(const std::string &path, std::vector<std::string> &for_match_paths_expanded);
}

View File

@ -75,59 +75,6 @@ namespace ErrorCodes
}
namespace
{
/// Forward-declare to use in expandSelector()
/// expandSelector() calls this function for every pattern it generates, and this function
/// calls expandSelector() back, so one of the two has to be declared ahead of its definition.
std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(const String & path_for_ls,
const HDFSFSPtr & fs,
const String & for_match);
/// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead.
std::vector<StorageHDFS::PathWithInfo> expandSelector(const String & path_for_ls,
const HDFSFSPtr & fs,
const String & for_match)
{
std::vector<size_t> anchor_positions = {};
bool opened = false, closed = false;
for (std::string::const_iterator it = for_match.begin(); it != for_match.end(); it++)
{
if (*it == '{')
{
anchor_positions.push_back(std::distance(for_match.begin(), it));
opened = true;
}
else if (*it == '}')
{
anchor_positions.push_back(std::distance(for_match.begin(), it));
closed = true;
break;
}
else if (*it == ',')
{
if (!opened)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Unexpected ''' found in path '{}' at position {}.", for_match, std::distance(for_match.begin(), it));
anchor_positions.push_back(std::distance(for_match.begin(), it));
}
}
if (!opened || !closed)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Invalid {{}} glob in path {}.", for_match);
std::vector<StorageHDFS::PathWithInfo> ret = {};
std::string common_prefix = for_match.substr(0, anchor_positions[0]);
std::string common_suffix = for_match.substr(anchor_positions[anchor_positions.size()-1] + 1);
for (size_t i = 1; i < anchor_positions.size(); ++i)
{
std::string expanded_matcher = common_prefix
+ for_match.substr(anchor_positions[i-1] + 1, (anchor_positions[i] - anchor_positions[i-1] - 1))
+ common_suffix;
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(path_for_ls, fs, expanded_matcher);
ret.insert(ret.end(), result_part.begin(), result_part.end());
}
return ret;
}
/* Recursive directory listing with matched paths as a result.
* Have the same method in StorageFile.
*/
@ -136,20 +83,24 @@ namespace
const HDFSFSPtr & fs,
const String & for_match)
{
/// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and ","
static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})");
std::string_view for_match_view(for_match);
std::string_view matched;
if (RE2::FindAndConsume(&for_match_view, enum_or_range, &matched))
{
std::string buffer(matched);
if (buffer.find(',') != std::string::npos)
return expandSelector(path_for_ls, fs, for_match);
}
std::vector<StorageHDFS::PathWithInfo> result;
const size_t first_glob_pos = for_match.find_first_of("*?{");
if (first_glob_pos == std::string::npos)
{
const String path = fs::path(path_for_ls + for_match.substr(1)).lexically_normal();
HDFSFileInfo ls;
ls.file_info = hdfsGetPathInfo(fs.get(), path.c_str());
if (ls.file_info != nullptr) // NOLINT
{
result.push_back(StorageHDFS::PathWithInfo{
String(path),
StorageHDFS::PathInfo{ls.file_info->mLastMod, static_cast<size_t>(ls.file_info->mSize)}});
}
return result;
}
const size_t end_of_path_without_globs = for_match.substr(0, first_glob_pos).rfind('/');
const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/'
const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/'
@ -171,7 +122,7 @@ namespace
throw Exception(
ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", prefix_without_globs, String(hdfsGetLastError()));
}
std::vector<StorageHDFS::PathWithInfo> result;
if (!ls.file_info && ls.length > 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null");
for (int i = 0; i < ls.length; ++i)
@ -222,7 +173,16 @@ namespace
HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef());
HDFSFSPtr fs = createHDFSFS(builder.get());
auto res = LSWithRegexpMatching("/", fs, path_from_uri);
Strings paths;
expandSelector(path_from_uri, paths);
std::vector<StorageHDFS::PathWithInfo> res;
for (const auto & path : paths)
{
auto part_of_res = LSWithRegexpMatching("/", fs, path);
res.insert(res.end(), part_of_res.begin(), part_of_res.end());
}
return res;
}

View File

@ -106,70 +106,6 @@ namespace ErrorCodes
namespace
{
/// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead.
///
/// Expands the first selector of `path` into one path per alternative and recurses on every
/// produced path, so that all selectors end up expanded. A path that contains no selector is
/// appended to `for_match_paths_expanded` unchanged.
///
/// Brace groups without a comma (for example the range glob `{N..M}`) are not selectors:
/// they are skipped and kept verbatim, so the later regexp-based matching still sees them.
///
/// Throws BAD_ARGUMENTS on a '{' inside an already opened group, on a '}' or ',' met outside
/// of braces while searching for the selector, or when no complete selector group is located.
void expandSelector(const std::string & path, Strings & for_match_paths_expanded)
{
    /// regexp for {expr1,expr2,expr3}, expr.. should be without "{", "}", "*" and ","
    static const re2::RE2 selector_regex(R"({([^{}*,]+,[^{}*]*[^{}*,])})");

    std::string_view path_view(path);
    std::string_view matched;

    /// No selector glob anywhere in the path: nothing to expand.
    if (!RE2::FindAndConsume(&path_view, selector_regex, &matched))
    {
        for_match_paths_expanded.push_back(path);
        return;
    }

    /// Positions of '{', of every ',' and of '}' for the first selector group of the path.
    std::vector<size_t> anchor_positions;
    bool opened = false;
    bool closed = false;

    for (size_t pos = 0; pos < path.size(); ++pos)
    {
        const char symbol = path[pos];

        if (symbol == '{')
        {
            if (opened)
                throw Exception(ErrorCodes::BAD_ARGUMENTS,
                    "Unexpected '{{' found in path '{}' at position {}.", path, pos);
            anchor_positions.push_back(pos);
            opened = true;
        }
        else if (symbol == '}')
        {
            if (!opened)
                throw Exception(ErrorCodes::BAD_ARGUMENTS,
                    "Unexpected '}}' found in path '{}' at position {}.", path, pos);

            if (anchor_positions.size() == 1)
            {
                /// Only the opening brace was recorded, i.e. the group has no commas ({N..M} and alike).
                /// It is not a selector: leave it untouched and keep looking for a real one.
                anchor_positions.clear();
                opened = false;
                continue;
            }

            anchor_positions.push_back(pos);
            closed = true;
            break;
        }
        else if (symbol == ',')
        {
            if (!opened)
                throw Exception(ErrorCodes::BAD_ARGUMENTS,
                    "Unexpected ',' found in path '{}' at position {}.", path, pos);
            anchor_positions.push_back(pos);
        }
    }

    if (!opened || !closed)
        throw Exception(ErrorCodes::BAD_ARGUMENTS,
            "Invalid {{}} glob in path {}.", path);

    /// Everything before '{' and after '}' is shared by all alternatives.
    const std::string common_prefix = path.substr(0, anchor_positions.front());
    const std::string common_suffix = path.substr(anchor_positions.back() + 1);

    for (size_t i = 1; i < anchor_positions.size(); ++i)
    {
        /// The alternative is the text strictly between two neighbouring anchors.
        const size_t alternative_begin = anchor_positions[i - 1] + 1;
        const std::string expanded_matcher = common_prefix
            + path.substr(alternative_begin, anchor_positions[i] - alternative_begin)
            + common_suffix;

        /// The produced path may contain further selectors: expand them recursively.
        expandSelector(expanded_matcher, for_match_paths_expanded);
    }
}
/* Recursive directory listing with matched paths as a result.
* Have the same method in StorageHDFS.
*/
@ -189,7 +125,7 @@ void listFilesWithRegexpMatchingImpl(
fs::path path = fs::canonical(path_for_ls + for_match);
result.push_back(path.string());
}
catch (const std::exception &)
catch (const std::exception &) // NOLINT
{
/// There is no such file, but we just ignore this.
// throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File {} doesn't exist", for_match);