From 70aa6e267234f39467a72166467a96d9cdc5abfd Mon Sep 17 00:00:00 2001 From: zvonand Date: Thu, 2 Nov 2023 09:58:15 +0100 Subject: [PATCH] Added a shortcut for no-wildcard globs in HDFS --- src/Common/parseGlobs.cpp | 68 +++++++++++++++++++++++ src/Common/parseGlobs.h | 6 +- src/Storages/HDFS/StorageHDFS.cpp | 92 +++++++++---------------------- src/Storages/StorageFile.cpp | 66 +--------------------- 4 files changed, 100 insertions(+), 132 deletions(-) diff --git a/src/Common/parseGlobs.cpp b/src/Common/parseGlobs.cpp index e9460c03d0a..07f78730591 100644 --- a/src/Common/parseGlobs.cpp +++ b/src/Common/parseGlobs.cpp @@ -17,6 +17,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; +} + /* Transforms string from grep-wildcard-syntax ("{N..M}", "{a,b,c}" as in remote table function and "*", "?") to perl-regexp for using re2 library for matching * with such steps: * 1) search intervals like {0..9} and enums like {abc,xyz,qwe} in {}, replace them by regexp with pipe (expr1|expr2|expr3), @@ -116,4 +121,67 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob } return buf_final_processing.str(); } + +void expandSelector(const std::string & path, Strings & for_match_paths_expanded) +{ + /// regexp for {expr1,expr2,expr3}, expr.. should be without "{", "}", "*" and "," + static const re2::RE2 selector_regex(R"({([^{}*,]+,[^{}*]*[^{}*,])})"); + + std::string_view path_view(path); + std::string_view matched; + + if (RE2::FindAndConsume(&path_view, selector_regex, &matched)) + std::string buffer(matched); + else + { + for_match_paths_expanded.push_back(path); + return; + } + + Strings expanded_paths; + + std::vector anchor_positions = {}; + bool opened = false, closed = false; + + for (std::string::const_iterator it = path.begin(); it != path.end(); it++) + { + if (*it == '{') + { + if (opened) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unexpected '{{' found in path '{}' at position {}.", path, std::distance(path.begin(), it)); + anchor_positions.push_back(std::distance(path.begin(), it)); + opened = true; + } + else if (*it == '}') + { + if (!opened) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unexpected '}}' found in path '{}' at position {}.", path, std::distance(path.begin(), it)); + anchor_positions.push_back(std::distance(path.begin(), it)); + closed = true; + break; + } + else if (*it == ',') + { + if (!opened) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Unexpected ',' found in path '{}' at position {}.", path, std::distance(path.begin(), it)); + anchor_positions.push_back(std::distance(path.begin(), it)); + } + } + if (!opened || !closed) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Invalid {{}} glob in path {}.", path); + + std::string common_prefix = path.substr(0, anchor_positions[0]); + std::string common_suffix = path.substr(anchor_positions[anchor_positions.size()-1] + 1); + for (size_t i = 1; i < anchor_positions.size(); ++i) + { + std::string expanded_matcher = common_prefix + + path.substr(anchor_positions[i-1] + 1, (anchor_positions[i] - anchor_positions[i-1] - 1)) + + common_suffix; + expandSelector(expanded_matcher, for_match_paths_expanded); + } +} } diff --git a/src/Common/parseGlobs.h b/src/Common/parseGlobs.h index 043a87884cf..1397d84c8a4 100644 --- a/src/Common/parseGlobs.h +++ b/src/Common/parseGlobs.h @@ -6,5 +6,9 @@ namespace DB { /* Parse globs in string and make a regexp for it. */ -std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_globs); + std::string makeRegexpPatternFromGlobs(const std::string &initial_str_with_globs); + + +/// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead. + void expandSelector(const std::string &path, std::vector &for_match_paths_expanded); } diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index d827353ad8e..55aad03b7f7 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -75,59 +75,6 @@ namespace ErrorCodes } namespace { - /// Forward-declare to use in expandSelector() - std::vector LSWithRegexpMatching(const String & path_for_ls, - const HDFSFSPtr & fs, - const String & for_match); - - /// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead. - std::vector expandSelector(const String & path_for_ls, - const HDFSFSPtr & fs, - const String & for_match) - { - std::vector anchor_positions = {}; - bool opened = false, closed = false; - - for (std::string::const_iterator it = for_match.begin(); it != for_match.end(); it++) - { - if (*it == '{') - { - anchor_positions.push_back(std::distance(for_match.begin(), it)); - opened = true; - } - else if (*it == '}') - { - anchor_positions.push_back(std::distance(for_match.begin(), it)); - closed = true; - break; - } - else if (*it == ',') - { - if (!opened) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Unexpected ''' found in path '{}' at position {}.", for_match, std::distance(for_match.begin(), it)); - anchor_positions.push_back(std::distance(for_match.begin(), it)); - } - } - if (!opened || !closed) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Invalid {{}} glob in path {}.", for_match); - - std::vector ret = {}; - - std::string common_prefix = for_match.substr(0, anchor_positions[0]); - std::string common_suffix = for_match.substr(anchor_positions[anchor_positions.size()-1] + 1); - for (size_t i = 1; i < anchor_positions.size(); ++i) - { - std::string expanded_matcher = common_prefix - + for_match.substr(anchor_positions[i-1] + 1, (anchor_positions[i] - anchor_positions[i-1] - 1)) - + common_suffix; - std::vector result_part = LSWithRegexpMatching(path_for_ls, fs, expanded_matcher); - ret.insert(ret.end(), result_part.begin(), result_part.end()); - } - return ret; - } - /* Recursive directory listing with matched paths as a result. * Have the same method in StorageFile. */ @@ -136,20 +83,24 @@ namespace const HDFSFSPtr & fs, const String & for_match) { - /// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and "," - static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})"); - - std::string_view for_match_view(for_match); - std::string_view matched; - if (RE2::FindAndConsume(&for_match_view, enum_or_range, &matched)) - { - std::string buffer(matched); - if (buffer.find(',') != std::string::npos) - return expandSelector(path_for_ls, fs, for_match); - } + std::vector result; const size_t first_glob_pos = for_match.find_first_of("*?{"); + if (first_glob_pos == std::string::npos) + { + const String path = fs::path(path_for_ls + for_match.substr(1)).lexically_normal(); + HDFSFileInfo ls; + ls.file_info = hdfsGetPathInfo(fs.get(), path.c_str()); + if (ls.file_info != nullptr) // NOLINT + { + result.push_back(StorageHDFS::PathWithInfo{ + String(path), + StorageHDFS::PathInfo{ls.file_info->mLastMod, static_cast(ls.file_info->mSize)}}); + } + return result; + } + const size_t end_of_path_without_globs = for_match.substr(0, first_glob_pos).rfind('/'); const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' @@ -171,7 +122,7 @@ namespace throw Exception( ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", prefix_without_globs, String(hdfsGetLastError())); } - std::vector result; + if (!ls.file_info && ls.length > 0) throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null"); for (int i = 0; i < ls.length; ++i) @@ -222,7 +173,16 @@ namespace HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); HDFSFSPtr fs = createHDFSFS(builder.get()); - auto res = LSWithRegexpMatching("/", fs, path_from_uri); + Strings paths; + expandSelector(path_from_uri, paths); + + std::vector res; + + for (const auto & path : paths) + { + auto part_of_res = LSWithRegexpMatching("/", fs, path); + res.insert(res.end(), part_of_res.begin(), part_of_res.end()); + } return res; } diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index fcac7673b16..4c982473255 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -106,70 +106,6 @@ namespace ErrorCodes namespace { -/// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead. -void expandSelector(const std::string & path, Strings & for_match_paths_expanded) -{ - /// regexp for {expr1,expr2,expr3}, expr.. should be without "{", "}", "*" and "," - static const re2::RE2 selector_regex(R"({([^{}*,]+,[^{}*]*[^{}*,])})"); - - std::string_view path_view(path); - std::string_view matched; - - if (RE2::FindAndConsume(&path_view, selector_regex, &matched)) - std::string buffer(matched); - else - { - for_match_paths_expanded.push_back(path); - return; - } - - Strings expanded_paths; - - std::vector anchor_positions = {}; - bool opened = false, closed = false; - - for (std::string::const_iterator it = path.begin(); it != path.end(); it++) - { - if (*it == '{') - { - if (opened) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Unexpected '{{' found in path '{}' at position {}.", path, std::distance(path.begin(), it)); - anchor_positions.push_back(std::distance(path.begin(), it)); - opened = true; - } - else if (*it == '}') - { - if (!opened) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Unexpected '}}' found in path '{}' at position {}.", path, std::distance(path.begin(), it)); - anchor_positions.push_back(std::distance(path.begin(), it)); - closed = true; - break; - } - else if (*it == ',') - { - if (!opened) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Unexpected ',' found in path '{}' at position {}.", path, std::distance(path.begin(), it)); - anchor_positions.push_back(std::distance(path.begin(), it)); - } - } - if (!opened || !closed) - throw Exception(ErrorCodes::BAD_ARGUMENTS, - "Invalid {{}} glob in path {}.", path); - - std::string common_prefix = path.substr(0, anchor_positions[0]); - std::string common_suffix = path.substr(anchor_positions[anchor_positions.size()-1] + 1); - for (size_t i = 1; i < anchor_positions.size(); ++i) - { - std::string expanded_matcher = common_prefix - + path.substr(anchor_positions[i-1] + 1, (anchor_positions[i] - anchor_positions[i-1] - 1)) - + common_suffix; - expandSelector(expanded_matcher, for_match_paths_expanded); - } -} - /* Recursive directory listing with matched paths as a result. * Have the same method in StorageHDFS. */ @@ -189,7 +125,7 @@ void listFilesWithRegexpMatchingImpl( fs::path path = fs::canonical(path_for_ls + for_match); result.push_back(path.string()); } - catch (const std::exception &) + catch (const std::exception &) // NOLINT { /// There is no such file, but we just ignore this. // throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File {} doesn't exist", for_match);