do all the same for HDFS + remove setting

This commit is contained in:
zvonand 2023-09-21 01:50:41 +02:00
parent 5f67788c03
commit a05bb020d4
3 changed files with 54 additions and 90 deletions

View File

@ -802,7 +802,6 @@ class IColumn;
M(Timezone, session_timezone, "", "This setting can be removed in the future due to potential caveats. It is experimental and is not suitable for production usage. The default timezone for current session or query. The server default timezone if empty.", 0) \
M(Bool, allow_create_index_without_type, false, "Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests.", 0)\
M(Bool, create_index_ignore_unique, false, "Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests.", 0) \
M(Bool, ignore_access_denied_multidirectory_globs, false, "Ignore access denied errors when processing multi-directory globs for file & HDFS.", 0)\
// End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

View File

@ -76,83 +76,58 @@ namespace ErrorCodes
}
namespace
{
/// Forward-declared to use in LSWithFoldedRegexpMatching w/o circular dependency.
/// Forward-declare to use in expandSelector()
std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(const String & path_for_ls,
const HDFSFSPtr & fs,
const String & for_match,
bool ignore_access_denied_multidirectory_globs);
const String & for_match);
/*
* When `{...}` has any `/`s, it must be processed in a different way:
* Basically, a path with globs is processed by LSWithRegexpMatching. In case it detects multi-dir glob {.../..., .../...},
* LSWithFoldedRegexpMatching is in charge from now on.
* It works a bit differently: it still recursively goes through subdirectories, but does not match every directory against the glob.
* Instead, it goes many levels down (until the approximate max_depth is reached) and compares this multi-dir path to a glob.
* StorageFile.cpp has the same logic.
*/
std::vector<StorageHDFS::PathWithInfo> LSWithFoldedRegexpMatching(const String & path_for_ls,
/// Process {a,b,c...} globs separately: don't match them against a regex, but generate one path per alternative (a, b, c, ...) instead.
std::vector<StorageHDFS::PathWithInfo> expandSelector(const String & path_for_ls,
const HDFSFSPtr & fs,
const String & processed_suffix,
const String & suffix_with_globs,
re2::RE2 & matcher,
const size_t max_depth,
const size_t next_slash_after_glob_pos,
bool ignore_access_denied_multidirectory_globs)
const String & for_match)
{
/// We don't need to go all the way in every directory if max_depth is reached
/// as it is upper limit of depth by simply counting `/`s in curly braces
if (!max_depth)
return {};
std::vector<size_t> anchor_positions = {};
bool opened = false, closed = false;
HDFSFileInfo ls;
ls.file_info = hdfsListDirectory(fs.get(), path_for_ls.data(), &ls.length);
if (ls.file_info == nullptr && errno != ENOENT) // NOLINT
for (std::string::const_iterator it = for_match.begin(); it != for_match.end(); it++)
{
// ignore file not found (as in LSWithRegexpMatching)
// still throw on any other error; libhdfs3 doesn't have a function to get the exception type, so use errno.
// ignore permission denied if ignore_access_denied_multidirectory_globs is true
if (!(ignore_access_denied_multidirectory_globs && errno == EACCES))
throw Exception(
ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", path_for_ls, String(hdfsGetLastError()));
if (*it == '{')
{
anchor_positions.push_back(std::distance(for_match.begin(), it));
opened = true;
}
else if (*it == '}')
{
anchor_positions.push_back(std::distance(for_match.begin(), it));
closed = true;
break;
}
else if (*it == ',')
{
/// A ',' is only accepted after a '{' has been seen; otherwise the path is rejected.
if (!opened)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Unexpected ',' found in path '{}' at position {}.", for_match, std::distance(for_match.begin(), it));
/// Record the separator position; the alternatives are later cut out between consecutive anchors.
anchor_positions.push_back(std::distance(for_match.begin(), it));
}
}
if (!opened || !closed)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Invalid {{}} glob in path {}.", for_match);
std::vector<StorageHDFS::PathWithInfo> result;
std::vector<StorageHDFS::PathWithInfo> ret = {};
if (!ls.file_info && ls.length > 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null");
for (int i = 0; i < ls.length; ++i)
std::string common_prefix = for_match.substr(0, anchor_positions[0]);
std::string common_suffix = for_match.substr(anchor_positions[anchor_positions.size()-1] + 1);
for (size_t i = 1; i < anchor_positions.size(); ++i)
{
const String full_path = String(ls.file_info[i].mName);
const size_t last_slash = full_path.rfind('/');
const String dir_or_file_name = full_path.substr(last_slash);
const bool is_directory = ls.file_info[i].mKind == 'D';
if (re2::RE2::FullMatch(processed_suffix + dir_or_file_name, matcher))
{
if (next_slash_after_glob_pos == std::string::npos)
{
result.emplace_back(StorageHDFS::PathWithInfo{
String(ls.file_info[i].mName),
StorageHDFS::PathInfo{ls.file_info[i].mLastMod, static_cast<size_t>(ls.file_info[i].mSize)}});
std::ostringstream oss;
oss << common_prefix
<< for_match.substr(anchor_positions[i-1] + 1, (anchor_positions[i] - anchor_positions[i-1] - 1))
<< common_suffix;
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(path_for_ls, fs, oss.str());
ret.insert(ret.end(), result_part.begin(), result_part.end());
}
else
{
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(
fs::path(full_path) / "" , fs, suffix_with_globs.substr(next_slash_after_glob_pos),
ignore_access_denied_multidirectory_globs);
std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
}
}
else if (is_directory)
{
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithFoldedRegexpMatching(
fs::path(full_path), fs, processed_suffix + dir_or_file_name, suffix_with_globs,
matcher, max_depth - 1, next_slash_after_glob_pos, ignore_access_denied_multidirectory_globs);
std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
}
}
return result;
return ret;
}
/* Recursive directory listing with matched paths as a result.
@ -161,8 +136,7 @@ namespace
std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(
const String & path_for_ls,
const HDFSFSPtr & fs,
const String & for_match,
bool ignore_access_denied_multidirectory_globs)
const String & for_match)
{
const size_t first_glob_pos = for_match.find_first_of("*?{");
const bool has_glob = first_glob_pos != std::string::npos;
@ -171,30 +145,28 @@ namespace
const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/'
const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/'
size_t slashes_in_glob = 0;
bool has_curly_braces = false;
const size_t next_slash_after_glob_pos = [&]()
{
if (!has_glob)
return suffix_with_globs.find('/', 1);
size_t in_curly = 0;
for (std::string::const_iterator it = ++suffix_with_globs.begin(); it != suffix_with_globs.end(); it++)
{
if (*it == '{')
++in_curly;
else if (*it == '/')
{
if (in_curly)
++slashes_in_glob;
else
return size_t(std::distance(suffix_with_globs.begin(), it));
has_curly_braces = true;
return size_t(0);
}
else if (*it == '}')
--in_curly;
else if (*it == '/')
return size_t(std::distance(suffix_with_globs.begin(), it));
}
return std::string::npos;
}();
if (has_curly_braces)
return expandSelector(path_for_ls, fs, for_match);
const std::string current_glob = suffix_with_globs.substr(0, next_slash_after_glob_pos);
re2::RE2 matcher(makeRegexpPatternFromGlobs(current_glob));
@ -202,12 +174,6 @@ namespace
throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP,
"Cannot compile regex from glob ({}): {}", for_match, matcher.error());
if (slashes_in_glob)
{
return LSWithFoldedRegexpMatching(fs::path(prefix_without_globs), fs, "", suffix_with_globs, matcher,
slashes_in_glob, next_slash_after_glob_pos, ignore_access_denied_multidirectory_globs);
}
HDFSFileInfo ls;
ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length);
if (ls.file_info == nullptr && errno != ENOENT) // NOLINT
@ -239,7 +205,7 @@ namespace
if (re2::RE2::FullMatch(file_name, matcher))
{
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs,
suffix_with_globs.substr(next_slash_after_glob_pos), ignore_access_denied_multidirectory_globs);
suffix_with_globs.substr(next_slash_after_glob_pos));
/// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check.
std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
}
@ -267,7 +233,7 @@ namespace
HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef());
HDFSFSPtr fs = createHDFSFS(builder.get());
auto res = LSWithRegexpMatching("/", fs, path_from_uri, context->getSettingsRef().ignore_access_denied_multidirectory_globs);
auto res = LSWithRegexpMatching("/", fs, path_from_uri);
return res;
}
}

View File

@ -160,7 +160,6 @@ void expandSelector(const std::string & path_for_ls,
<< common_suffix;
listFilesWithRegexpMatchingImpl(path_for_ls, oss.str(), total_bytes_to_read, result, recursive);
}
}
/* Recursive directory listing with matched paths as a result.