diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index 49ab822c738..23c174f9e7c 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -151,6 +151,15 @@ Names NamesAndTypesList::getNames() const return res; } +NameSet NamesAndTypesList::getNameSet() const +{ + NameSet res; + res.reserve(size()); + for (const NameAndTypePair & column : *this) + res.insert(column.name); + return res; +} + DataTypes NamesAndTypesList::getTypes() const { DataTypes res; diff --git a/src/Core/NamesAndTypes.h b/src/Core/NamesAndTypes.h index 29f40c45938..7f874172df3 100644 --- a/src/Core/NamesAndTypes.h +++ b/src/Core/NamesAndTypes.h @@ -100,6 +100,7 @@ public: void getDifference(const NamesAndTypesList & rhs, NamesAndTypesList & deleted, NamesAndTypesList & added) const; Names getNames() const; + NameSet getNameSet() const; DataTypes getTypes() const; /// Remove columns which names are not in the `names`. diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 3ae884bf98c..6ad453bc0de 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -131,7 +131,7 @@ std::shared_ptr StorageObjectStorageSourc else { ConfigurationPtr copy_configuration = configuration->clone(); - auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, local_context); if (filter_dag) { auto keys = configuration->getPaths(); @@ -142,7 +142,7 @@ std::shared_ptr StorageObjectStorageSourc VirtualColumnUtils::buildSetsForDAG(*filter_dag, local_context); auto actions = std::make_shared(std::move(*filter_dag)); - VirtualColumnUtils::filterByPathOrFile(keys, paths, actions, virtual_columns); + VirtualColumnUtils::filterByPathOrFile(keys, paths, actions, virtual_columns, local_context); copy_configuration->setPaths(keys); } @@ -489,6 +489,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( , virtual_columns(virtual_columns_) , throw_on_zero_files_match(throw_on_zero_files_match_) , read_keys(read_keys_) + , local_context(context_) , file_progress_callback(file_progress_callback_) { if (configuration->isNamespaceWithGlobs()) @@ -510,7 +511,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( } recursive = key_with_globs == "/**"; - if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns)) + if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, local_context)) { VirtualColumnUtils::buildSetsForDAG(*filter_dag, getContext()); filter_expr = std::make_shared(std::move(*filter_dag)); @@ -585,7 +586,7 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne for (const auto & object_info : new_batch) paths.push_back(getUniqueStoragePathIdentifier(*configuration, *object_info, false)); - VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_expr, virtual_columns); + VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_expr, virtual_columns, local_context); LOG_TEST(logger, "Filtered files: {} -> {}", paths.size(), new_batch.size()); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 7ae7a2358e9..8ee3b023638 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -220,6 +220,7 @@ private: bool is_finished = false; bool first_iteration = true; std::mutex next_mutex; + const ContextPtr local_context; std::function file_progress_callback; }; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 036a01914cf..55bc8083ec8 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1141,13 +1141,13 @@ StorageFileSource::FilesIterator::FilesIterator( { std::optional filter_dag; if (!distributed_processing && !archive_info && !files.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, context_); if (filter_dag) { VirtualColumnUtils::buildSetsForDAG(*filter_dag, context_); auto actions = std::make_shared(std::move(*filter_dag)); - VirtualColumnUtils::filterByPathOrFile(files, files, actions, virtual_columns); + VirtualColumnUtils::filterByPathOrFile(files, files, actions, virtual_columns, context_); } } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 1c7439f9a55..da0911ec20b 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -227,7 +227,7 @@ public: std::optional filter_dag; if (!uris.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, context); if (filter_dag) { @@ -238,7 +238,7 @@ public: VirtualColumnUtils::buildSetsForDAG(*filter_dag, context); auto actions = std::make_shared(std::move(*filter_dag)); - VirtualColumnUtils::filterByPathOrFile(uris, paths, actions, virtual_columns); + VirtualColumnUtils::filterByPathOrFile(uris, paths, actions, virtual_columns, context); } } diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index f0d276e4e56..14bf8ac8c13 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include @@ -46,6 +47,7 @@ #include "Functions/IFunction.h" #include "Functions/IFunctionAdaptors.h" #include "Functions/indexHint.h" +#include #include #include #include @@ -124,9 +126,18 @@ void filterBlockWithExpression(const ExpressionActionsPtr & actions, Block & blo } } +NamesAndTypesList getCommonVirtualsForFileLikeStorage() +{ + return {{"_path", std::make_shared(std::make_shared())}, + {"_file", std::make_shared(std::make_shared())}, + {"_size", makeNullable(std::make_shared())}, + {"_time", makeNullable(std::make_shared())}, + {"_etag", std::make_shared(std::make_shared())}}; +} + NameSet getVirtualNamesForFileLikeStorage() { - return {"_path", "_file", "_size", "_time", "_etag"}; + return getCommonVirtualsForFileLikeStorage().getNameSet(); } std::unordered_map parseHivePartitioningKeysAndValues(const String & path) @@ -154,8 +165,10 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto { VirtualColumnsDescription desc; - auto add_virtual = [&](const auto & name, const auto & type) + auto add_virtual = [&](const NameAndTypePair & pair) { + const auto & name = pair.getNameInStorage(); + const auto & type = pair.getTypeInStorage(); if (storage_columns.has(name)) { if (!context->getSettingsRef().use_hive_partitioning) @@ -172,11 +185,8 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto desc.addEphemeral(name, type, ""); }; - add_virtual("_path", std::make_shared(std::make_shared())); - add_virtual("_file", std::make_shared(std::make_shared())); - add_virtual("_size", makeNullable(std::make_shared())); - add_virtual("_time", makeNullable(std::make_shared())); - add_virtual("_etag", std::make_shared(std::make_shared())); + for (const auto & item : getCommonVirtualsForFileLikeStorage()) + add_virtual(item); if (context->getSettingsRef().use_hive_partitioning) { @@ -188,16 +198,16 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto if (type == nullptr) type = std::make_shared(); if (type->canBeInsideLowCardinality()) - add_virtual(item.first, std::make_shared(type)); + add_virtual({item.first, std::make_shared(type)}); else - add_virtual(item.first, type); + add_virtual({item.first, type}); } } return desc; } -static void addPathAndFileToVirtualColumns(Block & block, const String & path, size_t idx) +static void addPathAndFileToVirtualColumns(Block & block, const String & path, size_t idx, const FormatSettings & format_settings, bool use_hive_partitioning) { if (block.has("_path")) block.getByName("_path").column->assumeMutableRef().insert(path); @@ -214,18 +224,34 @@ static void addPathAndFileToVirtualColumns(Block & block, const String & path, s block.getByName("_file").column->assumeMutableRef().insert(file); } + if (use_hive_partitioning) + { + auto keys_and_values = parseHivePartitioningKeysAndValues(path); + for (const auto & [key, value] : keys_and_values) + { + if (const auto * column = block.findByName(key)) + { + ReadBufferFromString buf(value); + column->type->getDefaultSerialization()->deserializeWholeText(column->column->assumeMutableRef(), buf, format_settings); + } + } + } + block.getByName("_idx").column->assumeMutableRef().insert(idx); } -std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns) +std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { if (!predicate || virtual_columns.empty()) return {}; Block block; + NameSet common_virtuals; + if (context->getSettingsRef().use_hive_partitioning) + common_virtuals = getVirtualNamesForFileLikeStorage(); for (const auto & column : virtual_columns) { - if (column.name == "_file" || column.name == "_path") + if (column.name == "_file" || column.name == "_path" || !common_virtuals.contains(column.name)) block.insert({column.type->createColumn(), column.type, column.name}); } @@ -233,18 +259,19 @@ std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * pr return splitFilterDagForAllowedInputs(predicate, &block); } -ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns) +ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { Block block; + NameSet common_virtuals = getVirtualNamesForFileLikeStorage(); for (const auto & column : virtual_columns) { - if (column.name == "_file" || column.name == "_path") + if (column.name == "_file" || column.name == "_path" || !common_virtuals.contains(column.name)) block.insert({column.type->createColumn(), column.type, column.name}); } block.insert({ColumnUInt64::create(), std::make_shared(), "_idx"}); for (size_t i = 0; i != paths.size(); ++i) - addPathAndFileToVirtualColumns(block, paths[i], i); + addPathAndFileToVirtualColumns(block, paths[i], i, getFormatSettings(context), context->getSettingsRef().use_hive_partitioning); filterBlockWithExpression(actions, block); diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index 6aa08b2aef2..a9c46569a83 100644 --- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -75,14 +75,14 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage( const std::string & sample_path = "", std::optional format_settings_ = std::nullopt); -std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns); +std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context); -ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns); +ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context); template -void filterByPathOrFile(std::vector & sources, const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns) +void filterByPathOrFile(std::vector & sources, const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { - auto indexes_column = getFilterByPathAndFileIndexes(paths, actions, virtual_columns); + auto indexes_column = getFilterByPathAndFileIndexes(paths, actions, virtual_columns, context); const auto & indexes = typeid_cast(*indexes_column).getData(); if (indexes.size() == sources.size()) return; diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index 0fbc1fb556e..bb6a345c6ec 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -33,8 +33,8 @@ Cross Elizabeth [1,2,3] 42.42 Array(Int64) LowCardinality(Float64) 101 -2070 -2070 +2071 +2071 b 1 1 diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.reference b/tests/queries/0_stateless/03231_hive_partitioning_filtering.reference new file mode 100644 index 00000000000..a9e2f17562a --- /dev/null +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.reference @@ -0,0 +1,6 @@ +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh new file mode 100755 index 00000000000..30ae5b01a98 --- /dev/null +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -0,0 +1,72 @@ +#!/usr/bin/env bash +# Tags: no-fasttest + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +DATA_DIR=$USER_FILES_PATH/$CLICKHOUSE_TEST_UNIQUE_NAME +mkdir -p $DATA_DIR +cp -r $CURDIR/data_hive/ $DATA_DIR + +$CLICKHOUSE_CLIENT --query_id="test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1, optimize_count_from_files=0; +" + +$CLICKHOUSE_CLIENT --query " + SYSTEM FLUSH LOGS; +" + +for _ in {1..5}; do + count=$( $CLICKHOUSE_CLIENT --query " + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log + WHERE query_id='test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME' AND + current_database = currentDatabase() and type='QueryFinish';" ) + if [[ "$count" == "1" ]]; then + echo "1" + break + fi + sleep 1 +done + +$CLICKHOUSE_CLIENT --query_id="test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1, optimize_count_from_files=0; +" + +$CLICKHOUSE_CLIENT --query " + SYSTEM FLUSH LOGS; +" + +for _ in {1..5}; do + count=$( $CLICKHOUSE_CLIENT --query " + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log + WHERE query_id='test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME' AND + current_database = currentDatabase() and type='QueryFinish';" ) + if [[ "$count" == "1" ]]; then + echo "1" + break + fi + sleep 1 +done + +$CLICKHOUSE_CLIENT --query_id="test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1, optimize_count_from_files=0; +" + +$CLICKHOUSE_CLIENT --query " + SYSTEM FLUSH LOGS; +" + +for _ in {1..5}; do + count=$( $CLICKHOUSE_CLIENT --query " + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log + WHERE query_id='test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME' AND + current_database = currentDatabase() and type='QueryFinish';" ) + if [[ "$count" == "1" ]]; then + echo "1" + break + fi + sleep 1 +done + +rm -rf $DATA_DIR diff --git a/tests/queries/0_stateless/03232_file_path_normalizing.reference b/tests/queries/0_stateless/03232_file_path_normalizing.reference index 953db2c5dfe..3b41cf34056 100644 --- a/tests/queries/0_stateless/03232_file_path_normalizing.reference +++ b/tests/queries/0_stateless/03232_file_path_normalizing.reference @@ -1 +1 @@ -data_hive/partitioning/column0=Elizabeth/sample.parquet +data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet diff --git a/tests/queries/0_stateless/03232_file_path_normalizing.sh b/tests/queries/0_stateless/03232_file_path_normalizing.sh index e7a7a65be51..add6049f9b5 100755 --- a/tests/queries/0_stateless/03232_file_path_normalizing.sh +++ b/tests/queries/0_stateless/03232_file_path_normalizing.sh @@ -5,4 +5,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . "$CURDIR"/../shell_config.sh -$CLICKHOUSE_LOCAL -q "SELECT substring(_path, position(_path, 'data_hive')) FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') LIMIT 1;" +$CLICKHOUSE_LOCAL -q "SELECT substring(_path, position(_path, 'data_hive')) FROM file('$CURDIR/data_hive/partitioning/non_existing_column=*/sample.parquet') LIMIT 1;" diff --git a/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,3]/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,3]/sample.parquet new file mode 100644 index 00000000000..9b6a78cf8cc Binary files /dev/null and b/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,3]/sample.parquet differ diff --git a/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,4]/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,4]/sample.parquet new file mode 100644 index 00000000000..9b6a78cf8cc Binary files /dev/null and b/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,4]/sample.parquet differ diff --git a/tests/queries/0_stateless/data_hive/partitioning/column0=Stacy/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/column0=Stacy/sample.parquet new file mode 100644 index 00000000000..9b6a78cf8cc Binary files /dev/null and b/tests/queries/0_stateless/data_hive/partitioning/column0=Stacy/sample.parquet differ diff --git a/tests/queries/0_stateless/data_hive/partitioning/identifier=2071/email.csv b/tests/queries/0_stateless/data_hive/partitioning/identifier=2071/email.csv new file mode 100644 index 00000000000..936d995cc64 --- /dev/null +++ b/tests/queries/0_stateless/data_hive/partitioning/identifier=2071/email.csv @@ -0,0 +1,5 @@ +_login_email,_identifier,_first_name,_last_name +laura@example.com,2070,Laura,Grey +craig@example.com,4081,Craig,Johnson +mary@example.com,9346,Mary,Jenkins +jamie@example.com,5079,Jamie,Smith