mirror of
https://github.com/ClickHouse/ClickHouse.git
synced 2024-11-29 11:02:08 +00:00
Added a shortcut for no-wildcard globs in HDFS
This commit is contained in:
parent
3e47a54a94
commit
70aa6e2672
@ -17,6 +17,11 @@
|
||||
|
||||
namespace DB
|
||||
{
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
/* Transforms string from grep-wildcard-syntax ("{N..M}", "{a,b,c}" as in remote table function and "*", "?") to perl-regexp for using re2 library for matching
|
||||
* with such steps:
|
||||
* 1) search intervals like {0..9} and enums like {abc,xyz,qwe} in {}, replace them by regexp with pipe (expr1|expr2|expr3),
|
||||
@ -116,4 +121,67 @@ std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_glob
|
||||
}
|
||||
return buf_final_processing.str();
|
||||
}
|
||||
|
||||
void expandSelector(const std::string & path, Strings & for_match_paths_expanded)
|
||||
{
|
||||
/// regexp for {expr1,expr2,expr3}, expr.. should be without "{", "}", "*" and ","
|
||||
static const re2::RE2 selector_regex(R"({([^{}*,]+,[^{}*]*[^{}*,])})");
|
||||
|
||||
std::string_view path_view(path);
|
||||
std::string_view matched;
|
||||
|
||||
if (RE2::FindAndConsume(&path_view, selector_regex, &matched))
|
||||
std::string buffer(matched);
|
||||
else
|
||||
{
|
||||
for_match_paths_expanded.push_back(path);
|
||||
return;
|
||||
}
|
||||
|
||||
Strings expanded_paths;
|
||||
|
||||
std::vector<size_t> anchor_positions = {};
|
||||
bool opened = false, closed = false;
|
||||
|
||||
for (std::string::const_iterator it = path.begin(); it != path.end(); it++)
|
||||
{
|
||||
if (*it == '{')
|
||||
{
|
||||
if (opened)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Unexpected '{{' found in path '{}' at position {}.", path, std::distance(path.begin(), it));
|
||||
anchor_positions.push_back(std::distance(path.begin(), it));
|
||||
opened = true;
|
||||
}
|
||||
else if (*it == '}')
|
||||
{
|
||||
if (!opened)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Unexpected '}}' found in path '{}' at position {}.", path, std::distance(path.begin(), it));
|
||||
anchor_positions.push_back(std::distance(path.begin(), it));
|
||||
closed = true;
|
||||
break;
|
||||
}
|
||||
else if (*it == ',')
|
||||
{
|
||||
if (!opened)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Unexpected ',' found in path '{}' at position {}.", path, std::distance(path.begin(), it));
|
||||
anchor_positions.push_back(std::distance(path.begin(), it));
|
||||
}
|
||||
}
|
||||
if (!opened || !closed)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Invalid {{}} glob in path {}.", path);
|
||||
|
||||
std::string common_prefix = path.substr(0, anchor_positions[0]);
|
||||
std::string common_suffix = path.substr(anchor_positions[anchor_positions.size()-1] + 1);
|
||||
for (size_t i = 1; i < anchor_positions.size(); ++i)
|
||||
{
|
||||
std::string expanded_matcher = common_prefix
|
||||
+ path.substr(anchor_positions[i-1] + 1, (anchor_positions[i] - anchor_positions[i-1] - 1))
|
||||
+ common_suffix;
|
||||
expandSelector(expanded_matcher, for_match_paths_expanded);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -6,5 +6,9 @@ namespace DB
|
||||
{
|
||||
/* Parse globs in string and make a regexp for it.
|
||||
*/
|
||||
std::string makeRegexpPatternFromGlobs(const std::string & initial_str_with_globs);
|
||||
std::string makeRegexpPatternFromGlobs(const std::string &initial_str_with_globs);
|
||||
|
||||
|
||||
/// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead.
|
||||
void expandSelector(const std::string &path, std::vector<std::string> &for_match_paths_expanded);
|
||||
}
|
||||
|
@ -75,59 +75,6 @@ namespace ErrorCodes
|
||||
}
|
||||
namespace
|
||||
{
|
||||
/// Forward-declare to use in expandSelector()
|
||||
std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(const String & path_for_ls,
|
||||
const HDFSFSPtr & fs,
|
||||
const String & for_match);
|
||||
|
||||
/// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead.
|
||||
std::vector<StorageHDFS::PathWithInfo> expandSelector(const String & path_for_ls,
|
||||
const HDFSFSPtr & fs,
|
||||
const String & for_match)
|
||||
{
|
||||
std::vector<size_t> anchor_positions = {};
|
||||
bool opened = false, closed = false;
|
||||
|
||||
for (std::string::const_iterator it = for_match.begin(); it != for_match.end(); it++)
|
||||
{
|
||||
if (*it == '{')
|
||||
{
|
||||
anchor_positions.push_back(std::distance(for_match.begin(), it));
|
||||
opened = true;
|
||||
}
|
||||
else if (*it == '}')
|
||||
{
|
||||
anchor_positions.push_back(std::distance(for_match.begin(), it));
|
||||
closed = true;
|
||||
break;
|
||||
}
|
||||
else if (*it == ',')
|
||||
{
|
||||
if (!opened)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Unexpected ''' found in path '{}' at position {}.", for_match, std::distance(for_match.begin(), it));
|
||||
anchor_positions.push_back(std::distance(for_match.begin(), it));
|
||||
}
|
||||
}
|
||||
if (!opened || !closed)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Invalid {{}} glob in path {}.", for_match);
|
||||
|
||||
std::vector<StorageHDFS::PathWithInfo> ret = {};
|
||||
|
||||
std::string common_prefix = for_match.substr(0, anchor_positions[0]);
|
||||
std::string common_suffix = for_match.substr(anchor_positions[anchor_positions.size()-1] + 1);
|
||||
for (size_t i = 1; i < anchor_positions.size(); ++i)
|
||||
{
|
||||
std::string expanded_matcher = common_prefix
|
||||
+ for_match.substr(anchor_positions[i-1] + 1, (anchor_positions[i] - anchor_positions[i-1] - 1))
|
||||
+ common_suffix;
|
||||
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(path_for_ls, fs, expanded_matcher);
|
||||
ret.insert(ret.end(), result_part.begin(), result_part.end());
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
/* Recursive directory listing with matched paths as a result.
|
||||
* The same method exists in StorageFile.
|
||||
*/
|
||||
@ -136,20 +83,24 @@ namespace
|
||||
const HDFSFSPtr & fs,
|
||||
const String & for_match)
|
||||
{
|
||||
/// regexp for {expr1,expr2,expr3} or {M..N}, where M and N - non-negative integers, expr's should be without "{", "}", "*" and ","
|
||||
static const re2::RE2 enum_or_range(R"({([\d]+\.\.[\d]+|[^{}*,]+,[^{}*]*[^{}*,])})");
|
||||
|
||||
std::string_view for_match_view(for_match);
|
||||
std::string_view matched;
|
||||
if (RE2::FindAndConsume(&for_match_view, enum_or_range, &matched))
|
||||
{
|
||||
std::string buffer(matched);
|
||||
if (buffer.find(',') != std::string::npos)
|
||||
return expandSelector(path_for_ls, fs, for_match);
|
||||
}
|
||||
std::vector<StorageHDFS::PathWithInfo> result;
|
||||
|
||||
const size_t first_glob_pos = for_match.find_first_of("*?{");
|
||||
|
||||
if (first_glob_pos == std::string::npos)
|
||||
{
|
||||
const String path = fs::path(path_for_ls + for_match.substr(1)).lexically_normal();
|
||||
HDFSFileInfo ls;
|
||||
ls.file_info = hdfsGetPathInfo(fs.get(), path.c_str());
|
||||
if (ls.file_info != nullptr) // NOLINT
|
||||
{
|
||||
result.push_back(StorageHDFS::PathWithInfo{
|
||||
String(path),
|
||||
StorageHDFS::PathInfo{ls.file_info->mLastMod, static_cast<size_t>(ls.file_info->mSize)}});
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
const size_t end_of_path_without_globs = for_match.substr(0, first_glob_pos).rfind('/');
|
||||
const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/'
|
||||
const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/'
|
||||
@ -171,7 +122,7 @@ namespace
|
||||
throw Exception(
|
||||
ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", prefix_without_globs, String(hdfsGetLastError()));
|
||||
}
|
||||
std::vector<StorageHDFS::PathWithInfo> result;
|
||||
|
||||
if (!ls.file_info && ls.length > 0)
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null");
|
||||
for (int i = 0; i < ls.length; ++i)
|
||||
@ -222,7 +173,16 @@ namespace
|
||||
HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef());
|
||||
HDFSFSPtr fs = createHDFSFS(builder.get());
|
||||
|
||||
auto res = LSWithRegexpMatching("/", fs, path_from_uri);
|
||||
Strings paths;
|
||||
expandSelector(path_from_uri, paths);
|
||||
|
||||
std::vector<StorageHDFS::PathWithInfo> res;
|
||||
|
||||
for (const auto & path : paths)
|
||||
{
|
||||
auto part_of_res = LSWithRegexpMatching("/", fs, path);
|
||||
res.insert(res.end(), part_of_res.begin(), part_of_res.end());
|
||||
}
|
||||
return res;
|
||||
}
|
||||
|
||||
|
@ -106,70 +106,6 @@ namespace ErrorCodes
|
||||
|
||||
namespace
|
||||
{
|
||||
/// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead.
|
||||
void expandSelector(const std::string & path, Strings & for_match_paths_expanded)
|
||||
{
|
||||
/// regexp for {expr1,expr2,expr3}, expr.. should be without "{", "}", "*" and ","
|
||||
static const re2::RE2 selector_regex(R"({([^{}*,]+,[^{}*]*[^{}*,])})");
|
||||
|
||||
std::string_view path_view(path);
|
||||
std::string_view matched;
|
||||
|
||||
if (RE2::FindAndConsume(&path_view, selector_regex, &matched))
|
||||
std::string buffer(matched);
|
||||
else
|
||||
{
|
||||
for_match_paths_expanded.push_back(path);
|
||||
return;
|
||||
}
|
||||
|
||||
Strings expanded_paths;
|
||||
|
||||
std::vector<size_t> anchor_positions = {};
|
||||
bool opened = false, closed = false;
|
||||
|
||||
for (std::string::const_iterator it = path.begin(); it != path.end(); it++)
|
||||
{
|
||||
if (*it == '{')
|
||||
{
|
||||
if (opened)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Unexpected '{{' found in path '{}' at position {}.", path, std::distance(path.begin(), it));
|
||||
anchor_positions.push_back(std::distance(path.begin(), it));
|
||||
opened = true;
|
||||
}
|
||||
else if (*it == '}')
|
||||
{
|
||||
if (!opened)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Unexpected '}}' found in path '{}' at position {}.", path, std::distance(path.begin(), it));
|
||||
anchor_positions.push_back(std::distance(path.begin(), it));
|
||||
closed = true;
|
||||
break;
|
||||
}
|
||||
else if (*it == ',')
|
||||
{
|
||||
if (!opened)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Unexpected ',' found in path '{}' at position {}.", path, std::distance(path.begin(), it));
|
||||
anchor_positions.push_back(std::distance(path.begin(), it));
|
||||
}
|
||||
}
|
||||
if (!opened || !closed)
|
||||
throw Exception(ErrorCodes::BAD_ARGUMENTS,
|
||||
"Invalid {{}} glob in path {}.", path);
|
||||
|
||||
std::string common_prefix = path.substr(0, anchor_positions[0]);
|
||||
std::string common_suffix = path.substr(anchor_positions[anchor_positions.size()-1] + 1);
|
||||
for (size_t i = 1; i < anchor_positions.size(); ++i)
|
||||
{
|
||||
std::string expanded_matcher = common_prefix
|
||||
+ path.substr(anchor_positions[i-1] + 1, (anchor_positions[i] - anchor_positions[i-1] - 1))
|
||||
+ common_suffix;
|
||||
expandSelector(expanded_matcher, for_match_paths_expanded);
|
||||
}
|
||||
}
|
||||
|
||||
/* Recursive directory listing with matched paths as a result.
|
||||
* The same method exists in StorageHDFS.
|
||||
*/
|
||||
@ -189,7 +125,7 @@ void listFilesWithRegexpMatchingImpl(
|
||||
fs::path path = fs::canonical(path_for_ls + for_match);
|
||||
result.push_back(path.string());
|
||||
}
|
||||
catch (const std::exception &)
|
||||
catch (const std::exception &) // NOLINT
|
||||
{
|
||||
/// There is no such file, but we just ignore this.
|
||||
// throw Exception(ErrorCodes::FILE_DOESNT_EXIST, "File {} doesn't exist", for_match);
|
||||
|
Loading…
Reference in New Issue
Block a user