Do the same for HDFS and remove the ignore_access_denied_multidirectory_globs setting

This commit is contained in:
zvonand 2023-09-21 01:50:41 +02:00
parent 5f67788c03
commit a05bb020d4
3 changed files with 54 additions and 90 deletions

View File

@ -802,7 +802,6 @@ class IColumn;
M(Timezone, session_timezone, "", "This setting can be removed in the future due to potential caveats. It is experimental and is not suitable for production usage. The default timezone for current session or query. The server default timezone if empty.", 0) \ M(Timezone, session_timezone, "", "This setting can be removed in the future due to potential caveats. It is experimental and is not suitable for production usage. The default timezone for current session or query. The server default timezone if empty.", 0) \
M(Bool, allow_create_index_without_type, false, "Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests.", 0)\ M(Bool, allow_create_index_without_type, false, "Allow CREATE INDEX query without TYPE. Query will be ignored. Made for SQL compatibility tests.", 0)\
M(Bool, create_index_ignore_unique, false, "Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests.", 0) \ M(Bool, create_index_ignore_unique, false, "Ignore UNIQUE keyword in CREATE UNIQUE INDEX. Made for SQL compatibility tests.", 0) \
M(Bool, ignore_access_denied_multidirectory_globs, false, "Ignore access denied errors when processing multi-directory globs for file & HDFS.", 0)\
// End of COMMON_SETTINGS // End of COMMON_SETTINGS
// Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS. // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

View File

@ -76,83 +76,58 @@ namespace ErrorCodes
} }
namespace namespace
{ {
/// Forward-declared to use in LSWithFoldedRegexpMatching w/o circular dependency. /// Forward-declare to use in expandSelector()
std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(const String & path_for_ls, std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(const String & path_for_ls,
const HDFSFSPtr & fs, const HDFSFSPtr & fs,
const String & for_match, const String & for_match);
bool ignore_access_denied_multidirectory_globs);
/* /// Process {a,b,c...} globs separately: don't match it against regex, but generate a,b,c strings instead.
* When `{...}` has any `/`s, it must be processed in a different way: std::vector<StorageHDFS::PathWithInfo> expandSelector(const String & path_for_ls,
* Basically, a path with globs is processed by LSWithRegexpMatching. In case it detects multi-dir glob {.../..., .../...}, const HDFSFSPtr & fs,
* LSWithFoldedRegexpMatching is in charge from now on. const String & for_match)
* It works a bit different: it still recursively goes through subdirectories, but does not match every directory to glob.
* Instead, it goes many levels down (until the approximate max_depth is reached) and compares this multi-dir path to a glob.
* StorageFile.cpp has the same logic.
*/
std::vector<StorageHDFS::PathWithInfo> LSWithFoldedRegexpMatching(const String & path_for_ls,
const HDFSFSPtr & fs,
const String & processed_suffix,
const String & suffix_with_globs,
re2::RE2 & matcher,
const size_t max_depth,
const size_t next_slash_after_glob_pos,
bool ignore_access_denied_multidirectory_globs)
{ {
/// We don't need to go all the way in every directory if max_depth is reached std::vector<size_t> anchor_positions = {};
/// as it is upper limit of depth by simply counting `/`s in curly braces bool opened = false, closed = false;
if (!max_depth)
return {};
HDFSFileInfo ls; for (std::string::const_iterator it = for_match.begin(); it != for_match.end(); it++)
ls.file_info = hdfsListDirectory(fs.get(), path_for_ls.data(), &ls.length);
if (ls.file_info == nullptr && errno != ENOENT) // NOLINT
{ {
// ignore file not found (as in LSWithRegexpMatching) if (*it == '{')
// keep throw other exception, libhdfs3 doesn't have function to get exception type, so use errno.
// ignore permission denied if ignore_access_denied_multidirectory_globs is true
if (!(ignore_access_denied_multidirectory_globs && errno == EACCES))
throw Exception(
ErrorCodes::ACCESS_DENIED, "Cannot list directory {}: {}", path_for_ls, String(hdfsGetLastError()));
}
std::vector<StorageHDFS::PathWithInfo> result;
if (!ls.file_info && ls.length > 0)
throw Exception(ErrorCodes::LOGICAL_ERROR, "file_info shouldn't be null");
for (int i = 0; i < ls.length; ++i)
{
const String full_path = String(ls.file_info[i].mName);
const size_t last_slash = full_path.rfind('/');
const String dir_or_file_name = full_path.substr(last_slash);
const bool is_directory = ls.file_info[i].mKind == 'D';
if (re2::RE2::FullMatch(processed_suffix + dir_or_file_name, matcher))
{ {
if (next_slash_after_glob_pos == std::string::npos) anchor_positions.push_back(std::distance(for_match.begin(), it));
{ opened = true;
result.emplace_back(StorageHDFS::PathWithInfo{
String(ls.file_info[i].mName),
StorageHDFS::PathInfo{ls.file_info[i].mLastMod, static_cast<size_t>(ls.file_info[i].mSize)}});
}
else
{
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(
fs::path(full_path) / "" , fs, suffix_with_globs.substr(next_slash_after_glob_pos),
ignore_access_denied_multidirectory_globs);
std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
}
} }
else if (is_directory) else if (*it == '}')
{ {
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithFoldedRegexpMatching( anchor_positions.push_back(std::distance(for_match.begin(), it));
fs::path(full_path), fs, processed_suffix + dir_or_file_name, suffix_with_globs, closed = true;
matcher, max_depth - 1, next_slash_after_glob_pos, ignore_access_denied_multidirectory_globs); break;
std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); }
else if (*it == ',')
{
if (!opened)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Unexpected ',' found in path '{}' at position {}.", for_match, std::distance(for_match.begin(), it));
anchor_positions.push_back(std::distance(for_match.begin(), it));
} }
} }
return result; if (!opened || !closed)
throw Exception(ErrorCodes::BAD_ARGUMENTS,
"Invalid {{}} glob in path {}.", for_match);
std::vector<StorageHDFS::PathWithInfo> ret = {};
std::string common_prefix = for_match.substr(0, anchor_positions[0]);
std::string common_suffix = for_match.substr(anchor_positions[anchor_positions.size()-1] + 1);
for (size_t i = 1; i < anchor_positions.size(); ++i)
{
std::ostringstream oss;
oss << common_prefix
<< for_match.substr(anchor_positions[i-1] + 1, (anchor_positions[i] - anchor_positions[i-1] - 1))
<< common_suffix;
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(path_for_ls, fs, oss.str());
ret.insert(ret.end(), result_part.begin(), result_part.end());
}
return ret;
} }
/* Recursive directory listing with matched paths as a result. /* Recursive directory listing with matched paths as a result.
@ -161,8 +136,7 @@ namespace
std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching( std::vector<StorageHDFS::PathWithInfo> LSWithRegexpMatching(
const String & path_for_ls, const String & path_for_ls,
const HDFSFSPtr & fs, const HDFSFSPtr & fs,
const String & for_match, const String & for_match)
bool ignore_access_denied_multidirectory_globs)
{ {
const size_t first_glob_pos = for_match.find_first_of("*?{"); const size_t first_glob_pos = for_match.find_first_of("*?{");
const bool has_glob = first_glob_pos != std::string::npos; const bool has_glob = first_glob_pos != std::string::npos;
@ -171,30 +145,28 @@ namespace
const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/' const String suffix_with_globs = for_match.substr(end_of_path_without_globs); /// begin with '/'
const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/' const String prefix_without_globs = path_for_ls + for_match.substr(1, end_of_path_without_globs); /// ends with '/'
size_t slashes_in_glob = 0; bool has_curly_braces = false;
const size_t next_slash_after_glob_pos = [&]() const size_t next_slash_after_glob_pos = [&]()
{ {
if (!has_glob) if (!has_glob)
return suffix_with_globs.find('/', 1); return suffix_with_globs.find('/', 1);
size_t in_curly = 0;
for (std::string::const_iterator it = ++suffix_with_globs.begin(); it != suffix_with_globs.end(); it++) for (std::string::const_iterator it = ++suffix_with_globs.begin(); it != suffix_with_globs.end(); it++)
{ {
if (*it == '{') if (*it == '{')
++in_curly;
else if (*it == '/')
{ {
if (in_curly) has_curly_braces = true;
++slashes_in_glob; return size_t(0);
else
return size_t(std::distance(suffix_with_globs.begin(), it));
} }
else if (*it == '}') else if (*it == '/')
--in_curly; return size_t(std::distance(suffix_with_globs.begin(), it));
} }
return std::string::npos; return std::string::npos;
}(); }();
if (has_curly_braces)
return expandSelector(path_for_ls, fs, for_match);
const std::string current_glob = suffix_with_globs.substr(0, next_slash_after_glob_pos); const std::string current_glob = suffix_with_globs.substr(0, next_slash_after_glob_pos);
re2::RE2 matcher(makeRegexpPatternFromGlobs(current_glob)); re2::RE2 matcher(makeRegexpPatternFromGlobs(current_glob));
@ -202,12 +174,6 @@ namespace
throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP, throw Exception(ErrorCodes::CANNOT_COMPILE_REGEXP,
"Cannot compile regex from glob ({}): {}", for_match, matcher.error()); "Cannot compile regex from glob ({}): {}", for_match, matcher.error());
if (slashes_in_glob)
{
return LSWithFoldedRegexpMatching(fs::path(prefix_without_globs), fs, "", suffix_with_globs, matcher,
slashes_in_glob, next_slash_after_glob_pos, ignore_access_denied_multidirectory_globs);
}
HDFSFileInfo ls; HDFSFileInfo ls;
ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length); ls.file_info = hdfsListDirectory(fs.get(), prefix_without_globs.data(), &ls.length);
if (ls.file_info == nullptr && errno != ENOENT) // NOLINT if (ls.file_info == nullptr && errno != ENOENT) // NOLINT
@ -239,7 +205,7 @@ namespace
if (re2::RE2::FullMatch(file_name, matcher)) if (re2::RE2::FullMatch(file_name, matcher))
{ {
std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs, std::vector<StorageHDFS::PathWithInfo> result_part = LSWithRegexpMatching(fs::path(full_path) / "", fs,
suffix_with_globs.substr(next_slash_after_glob_pos), ignore_access_denied_multidirectory_globs); suffix_with_globs.substr(next_slash_after_glob_pos));
/// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check. /// Recursion depth is limited by pattern. '*' works only for depth = 1, for depth = 2 pattern path is '*/*'. So we do not need additional check.
std::move(result_part.begin(), result_part.end(), std::back_inserter(result)); std::move(result_part.begin(), result_part.end(), std::back_inserter(result));
} }
@ -267,7 +233,7 @@ namespace
HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); HDFSBuilderWrapper builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef());
HDFSFSPtr fs = createHDFSFS(builder.get()); HDFSFSPtr fs = createHDFSFS(builder.get());
auto res = LSWithRegexpMatching("/", fs, path_from_uri, context->getSettingsRef().ignore_access_denied_multidirectory_globs); auto res = LSWithRegexpMatching("/", fs, path_from_uri);
return res; return res;
} }
} }

View File

@ -160,7 +160,6 @@ void expandSelector(const std::string & path_for_ls,
<< common_suffix; << common_suffix;
listFilesWithRegexpMatchingImpl(path_for_ls, oss.str(), total_bytes_to_read, result, recursive); listFilesWithRegexpMatchingImpl(path_for_ls, oss.str(), total_bytes_to_read, result, recursive);
} }
} }
/* Recursive directory listing with matched paths as a result. /* Recursive directory listing with matched paths as a result.