From 189cbe25fe4321f020a9f7b72c901d7358c1f1af Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:28:18 +0200 Subject: [PATCH 01/33] init --- .../StorageObjectStorageSource.cpp | 12 +++--- .../StorageObjectStorageSource.h | 1 + src/Storages/StorageFile.cpp | 4 +- src/Storages/StorageURL.cpp | 7 +++- src/Storages/VirtualColumnUtils.cpp | 37 +++++++++++++++--- src/Storages/VirtualColumnUtils.h | 8 ++-- ...3231_hive_partitioning_filtering.reference | 6 +++ .../03231_hive_partitioning_filtering.sh | 29 ++++++++++++++ .../partitioning/array=[1,2,3]/sample.parquet | Bin 0 -> 1308 bytes .../partitioning/array=[1,2,4]/sample.parquet | Bin 0 -> 1308 bytes .../partitioning/column0=Stacy/sample.parquet | Bin 0 -> 1308 bytes .../partitioning/identifier=2071/email.csv | 5 +++ 12 files changed, 91 insertions(+), 18 deletions(-) create mode 100644 tests/queries/0_stateless/03231_hive_partitioning_filtering.reference create mode 100644 tests/queries/0_stateless/03231_hive_partitioning_filtering.sh create mode 100644 tests/queries/0_stateless/data_hive/partitioning/array=[1,2,3]/sample.parquet create mode 100644 tests/queries/0_stateless/data_hive/partitioning/array=[1,2,4]/sample.parquet create mode 100644 tests/queries/0_stateless/data_hive/partitioning/column0=Stacy/sample.parquet create mode 100644 tests/queries/0_stateless/data_hive/partitioning/identifier=2071/email.csv diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 04e319cd0b8..0d4471e3bda 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -131,10 +131,11 @@ std::shared_ptr StorageObjectStorageSourc else { ConfigurationPtr copy_configuration = configuration->clone(); - auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + auto keys = 
configuration->getPaths(); + String partitioning_path = fs::path(configuration->getNamespace()) / keys[0]; + auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, partitioning_path, local_context); if (filter_dag) { - auto keys = configuration->getPaths(); std::vector paths; paths.reserve(keys.size()); for (const auto & key : keys) @@ -142,7 +143,7 @@ std::shared_ptr StorageObjectStorageSourc VirtualColumnUtils::buildSetsForDAG(*filter_dag, local_context); auto actions = std::make_shared(std::move(*filter_dag)); - VirtualColumnUtils::filterByPathOrFile(keys, paths, actions, virtual_columns); + VirtualColumnUtils::filterByPathOrFile(keys, paths, actions, virtual_columns, local_context); copy_configuration->setPaths(keys); } @@ -492,6 +493,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( , virtual_columns(virtual_columns_) , throw_on_zero_files_match(throw_on_zero_files_match_) , read_keys(read_keys_) + , local_context(context_) , file_progress_callback(file_progress_callback_) { if (configuration->isNamespaceWithGlobs()) @@ -513,7 +515,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( } recursive = key_with_globs == "/**"; - if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns)) + if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, key_with_globs, local_context)) { VirtualColumnUtils::buildSetsForDAG(*filter_dag, getContext()); filter_expr = std::make_shared(std::move(*filter_dag)); @@ -588,7 +590,7 @@ StorageObjectStorage::ObjectInfoPtr StorageObjectStorageSource::GlobIterator::ne for (const auto & object_info : new_batch) paths.push_back(getUniqueStoragePathIdentifier(*configuration, *object_info, false)); - VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_expr, virtual_columns); + VirtualColumnUtils::filterByPathOrFile(new_batch, paths, filter_expr, virtual_columns, local_context); LOG_TEST(logger, 
"Filtered files: {} -> {}", paths.size(), new_batch.size()); } diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.h b/src/Storages/ObjectStorage/StorageObjectStorageSource.h index 7ae7a2358e9..8ee3b023638 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.h +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.h @@ -220,6 +220,7 @@ private: bool is_finished = false; bool first_iteration = true; std::mutex next_mutex; + const ContextPtr local_context; std::function file_progress_callback; }; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 50294df32a4..639af41c1cd 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -1140,13 +1140,13 @@ StorageFileSource::FilesIterator::FilesIterator( { std::optional filter_dag; if (!distributed_processing && !archive_info && !files.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, files[0], context_); if (filter_dag) { VirtualColumnUtils::buildSetsForDAG(*filter_dag, context_); auto actions = std::make_shared(std::move(*filter_dag)); - VirtualColumnUtils::filterByPathOrFile(files, files, actions, virtual_columns); + VirtualColumnUtils::filterByPathOrFile(files, files, actions, virtual_columns, context_); } } diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index fc1354b780a..572c4f20fa3 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -214,7 +214,10 @@ public: std::optional filter_dag; if (!uris.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns); + { + String partitioning_path = Poco::URI(uris[0]).getPath(); + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, partitioning_path, context); + } if (filter_dag) { @@ -225,7 +228,7 @@ public: 
VirtualColumnUtils::buildSetsForDAG(*filter_dag, context); auto actions = std::make_shared(std::move(*filter_dag)); - VirtualColumnUtils::filterByPathOrFile(uris, paths, actions, virtual_columns); + VirtualColumnUtils::filterByPathOrFile(uris, paths, actions, virtual_columns, context); } } diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index f0d276e4e56..788aeb66657 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -46,6 +46,7 @@ #include "Functions/IFunction.h" #include "Functions/IFunctionAdaptors.h" #include "Functions/indexHint.h" +#include #include #include #include @@ -197,7 +198,7 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto return desc; } -static void addPathAndFileToVirtualColumns(Block & block, const String & path, size_t idx) +static void addFilterDataToVirtualColumns(Block & block, const String & path, size_t idx, ColumnsWithTypeAndName partitioning_keys, const ContextPtr & context) { if (block.has("_path")) block.getByName("_path").column->assumeMutableRef().insert(path); @@ -214,18 +215,31 @@ static void addPathAndFileToVirtualColumns(Block & block, const String & path, s block.getByName("_file").column->assumeMutableRef().insert(file); } + for (const auto & item : partitioning_keys) + { + if (block.has(item.name)) + { + auto column = block.getByName(item.name).column; + ReadBufferFromString buf(item.column->getDataAt(0).toView()); + item.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); + } + } + block.getByName("_idx").column->assumeMutableRef().insert(idx); } -std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns) +std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const String & path, const ContextPtr & context) { if (!predicate 
|| virtual_columns.empty()) return {}; Block block; + std::unordered_map keys; + if (context->getSettingsRef().use_hive_partitioning) + keys = parseHivePartitioningKeysAndValues(path); for (const auto & column : virtual_columns) { - if (column.name == "_file" || column.name == "_path") + if (column.name == "_file" || column.name == "_path" || keys.contains(column.name)) block.insert({column.type->createColumn(), column.type, column.name}); } @@ -233,18 +247,31 @@ std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * pr return splitFilterDagForAllowedInputs(predicate, &block); } -ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns) +ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { Block block; + std::unordered_map keys; + ColumnsWithTypeAndName partitioning_columns; + if (context->getSettingsRef().use_hive_partitioning) + keys = parseHivePartitioningKeysAndValues(paths[0]); for (const auto & column : virtual_columns) { if (column.name == "_file" || column.name == "_path") block.insert({column.type->createColumn(), column.type, column.name}); + + auto it = keys.find(column.name); + if (it != keys.end()) + { + auto c = std::make_shared()->createColumn(); + c->insert(it->second); + block.insert({column.type->createColumn(), column.type, column.name}); + partitioning_columns.push_back({c->getPtr(), column.type, column.name}); + } } block.insert({ColumnUInt64::create(), std::make_shared(), "_idx"}); for (size_t i = 0; i != paths.size(); ++i) - addPathAndFileToVirtualColumns(block, paths[i], i); + addFilterDataToVirtualColumns(block, paths[i], i, partitioning_columns, context); filterBlockWithExpression(actions, block); diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index 6aa08b2aef2..ecfe44a1956 100644 
--- a/src/Storages/VirtualColumnUtils.h +++ b/src/Storages/VirtualColumnUtils.h @@ -75,14 +75,14 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage( const std::string & sample_path = "", std::optional format_settings_ = std::nullopt); -std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns); +std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const String & path, const ContextPtr & context); -ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns); +ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context); template -void filterByPathOrFile(std::vector & sources, const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns) +void filterByPathOrFile(std::vector & sources, const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { - auto indexes_column = getFilterByPathAndFileIndexes(paths, actions, virtual_columns); + auto indexes_column = getFilterByPathAndFileIndexes(paths, actions, virtual_columns, context); const auto & indexes = typeid_cast(*indexes_column).getData(); if (indexes.size() == sources.size()) return; diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.reference b/tests/queries/0_stateless/03231_hive_partitioning_filtering.reference new file mode 100644 index 00000000000..a9e2f17562a --- /dev/null +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.reference @@ -0,0 +1,6 @@ +1 +1 +1 +1 +1 +1 diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh new file mode 
100644 index 00000000000..719fed5bdaa --- /dev/null +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh + +${CLICKHOUSE_CLIENT} --query_id="test_03231_1" --query " + SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth'; +" + +${CLICKHOUSE_CLIENT} --query " + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1'; +" + +${CLICKHOUSE_CLIENT} --query_id="test_03231_2" --query " + SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070; +" + +${CLICKHOUSE_CLIENT} --query " + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2'; +" + +${CLICKHOUSE_CLIENT} --query_id="test_03231_3" --query " + SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3]; +" + +${CLICKHOUSE_CLIENT} --query " + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3'; +" diff --git a/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,3]/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,3]/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ 
zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,4]/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/array=[1,2,4]/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? 
literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_hive/partitioning/column0=Stacy/sample.parquet b/tests/queries/0_stateless/data_hive/partitioning/column0=Stacy/sample.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9b6a78cf8cc7cd3ece15e13c9b2f222c8f09b81e GIT binary patch literal 1308 zcmWG=3^EjD5Z%Hr`iosh^b{kI%_hpmz#!kv!2kyTLxU6Z9~tnZHa*@opEc(Au?K1g zOD4aYo#~scS*oJ`_R8gD$~^!1^Jl8v?6d#uZT@&1Z&h*OEsK+iS@vM( z^NvMD|`UJH2qG^xTWJ-dT6$7G6DVTky7Woy5#*nvWVEJpR{CJ{Fy0- zE8ux@_5^8x!?dEIRau&2MyW=j!5h*xtj<|H$T%nI_ zrsjz?W}YW@dt8{DRBI|`*(jU(m2ZmM@u#NQ!s{)z%{yLgtZF$)cAddC?xOT5D^_mz z-x7J9sr1v$v$K{(^`5h;Sz-1gc2*AGUh7}8F0R?}-B&E(IrH;G`GUhY z?q@1K*wQW0otd;iYI&}N?~AIE{%tkCroWN7t$#4bGw~KP0|PJ-eBc+|z=1tM#0JF{ zU3TEfTh6OHr)jl`=8?k_CV5&tiR=x1?{{sI`|Af*?oUEIqS_tiuleY8e||}EY3bMB zzp9qaKhIf|e>9xYs^&t{(WWC|y8X+=Uc{}=?T>Xh_5JxVk(1Vsywf&)T&i$tu2}yJ zsTDW>>9!Q_yZT7oEaCof4t43QdkFv1JFG`q9?h6g zxTpBgk6%&qwlli6{)!hkc#l_C=)}P;-Ys+NvjP>bYG~cCGCw}YQ1x-0z@w1)u@}^n zTV#|>Z7-{GtbTT=rr=<)~?``+iTxh4l+3|MS-tdVRHm+9w`h0!z=3knV zrSnX_{WmK}KJ?@4(a#30zmF(AmC{eNN7s8Lx}H>x1pMHFk2oys;%$ zvXN_R)m$dd8M|y^7q?Bh-x;&%icdYm3!CL}KR{`PNz%rYL4r4>G&wsZDZV&4BQ-Zs zl!ZZ*N0mu}Jvl$8G&j!xn4o|vkwidc4g-VODMm>dNgXu?8BrcdQ3gqbdKRFR7=zd% z4mA!N3D&gCqT&(>R>!2I%v3Q34HQ1GkiyV!C<@hogF|f<&;XY3{QMLNR)w6z;u4^K eWG+xU(4JF_Y8(t2Y%V}QxHvIf1_}lM%S8a*|2_@? 
literal 0 HcmV?d00001 diff --git a/tests/queries/0_stateless/data_hive/partitioning/identifier=2071/email.csv b/tests/queries/0_stateless/data_hive/partitioning/identifier=2071/email.csv new file mode 100644 index 00000000000..936d995cc64 --- /dev/null +++ b/tests/queries/0_stateless/data_hive/partitioning/identifier=2071/email.csv @@ -0,0 +1,5 @@ +_login_email,_identifier,_first_name,_last_name +laura@example.com,2070,Laura,Grey +craig@example.com,4081,Craig,Johnson +mary@example.com,9346,Mary,Jenkins +jamie@example.com,5079,Jamie,Smith From c6804122cb8785e744b9fa472c67ed95b7525bdf Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:52:29 +0200 Subject: [PATCH 02/33] fix shell --- .../queries/0_stateless/03231_hive_partitioning_filtering.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 719fed5bdaa..435f6fe4c4e 100644 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -1,8 +1,8 @@ #!/usr/bin/env bash -CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh -. "$CUR_DIR"/../shell_config.sh +. 
"$CURDIR"/../shell_config.sh ${CLICKHOUSE_CLIENT} --query_id="test_03231_1" --query " SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth'; From 4eca00a66635b167e35ad58be0beb176710c5b9b Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 27 Aug 2024 18:10:41 +0200 Subject: [PATCH 03/33] fix style --- .../0_stateless/03231_hive_partitioning_filtering.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 435f6fe4c4e..396cfc9da26 100644 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -9,7 +9,7 @@ ${CLICKHOUSE_CLIENT} --query_id="test_03231_1" --query " " ${CLICKHOUSE_CLIENT} --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1'; + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1' WHERE current_database = currentDatabase(); " ${CLICKHOUSE_CLIENT} --query_id="test_03231_2" --query " @@ -17,7 +17,7 @@ ${CLICKHOUSE_CLIENT} --query_id="test_03231_2" --query " " ${CLICKHOUSE_CLIENT} --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2'; + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' WHERE current_database = currentDatabase(); " ${CLICKHOUSE_CLIENT} --query_id="test_03231_3" --query " @@ -25,5 +25,5 @@ ${CLICKHOUSE_CLIENT} --query_id="test_03231_3" --query " " ${CLICKHOUSE_CLIENT} --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3'; + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE 
query_id='test_03231_3' WHERE current_database = currentDatabase(); " From 2741bf00e4c19d1e548cdda62452b078c7a65a05 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 27 Aug 2024 16:53:14 +0000 Subject: [PATCH 04/33] chmod +x --- tests/queries/0_stateless/03231_hive_partitioning_filtering.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tests/queries/0_stateless/03231_hive_partitioning_filtering.sh diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh old mode 100644 new mode 100755 From 9133505952ead54df50aeca643e947823560371b Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 27 Aug 2024 19:16:05 +0200 Subject: [PATCH 05/33] fix the test --- .../03231_hive_partitioning_filtering.sh | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 396cfc9da26..3a1f51bf0cc 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -4,26 +4,30 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -${CLICKHOUSE_CLIENT} --query_id="test_03231_1" --query " +$CLICKHOUSE_LOCAL --query_id="test_03231_1" --query " + SET use_hive_partitioning = 1; +" + +$CLICKHOUSE_LOCAL --query_id="test_03231_1" --query " SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth'; " -${CLICKHOUSE_CLIENT} --query " +$CLICKHOUSE_LOCAL --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1' WHERE current_database = currentDatabase(); " -${CLICKHOUSE_CLIENT} --query_id="test_03231_2" --query " +$CLICKHOUSE_LOCAL --query_id="test_03231_2" --query " SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070; " -${CLICKHOUSE_CLIENT} --query " +$CLICKHOUSE_LOCAL --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' WHERE current_database = currentDatabase(); " -${CLICKHOUSE_CLIENT} --query_id="test_03231_3" --query " +$CLICKHOUSE_LOCAL --query_id="test_03231_3" --query " SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3]; " -${CLICKHOUSE_CLIENT} --query " +$CLICKHOUSE_LOCAL --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3' WHERE current_database = currentDatabase(); " From 60c6eb26100d61b0a1f642a0bf00293725d91408 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 27 Aug 2024 19:42:47 +0200 Subject: [PATCH 06/33] trying to fix the test --- .../03231_hive_partitioning_filtering.sh | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 3a1f51bf0cc..e545f53f257 100755 --- 
a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -5,29 +5,25 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) . "$CURDIR"/../shell_config.sh $CLICKHOUSE_LOCAL --query_id="test_03231_1" --query " - SET use_hive_partitioning = 1; -" - -$CLICKHOUSE_LOCAL --query_id="test_03231_1" --query " - SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth'; + SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; " $CLICKHOUSE_LOCAL --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1' WHERE current_database = currentDatabase(); + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1' AND current_database = currentDatabase(); " $CLICKHOUSE_LOCAL --query_id="test_03231_2" --query " - SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070; + SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; " $CLICKHOUSE_LOCAL --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' WHERE current_database = currentDatabase(); + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' AND current_database = currentDatabase(); " $CLICKHOUSE_LOCAL --query_id="test_03231_3" --query " - SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3]; + SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; " $CLICKHOUSE_LOCAL 
--query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3' WHERE current_database = currentDatabase(); + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3' AND current_database = currentDatabase(); " From dc97bd6b9280b8640058c759357b3c12f520dfed Mon Sep 17 00:00:00 2001 From: yariks5s Date: Wed, 28 Aug 2024 17:22:47 +0000 Subject: [PATCH 07/33] review + testing the code --- src/Core/NamesAndTypes.cpp | 9 ++++ src/Core/NamesAndTypes.h | 1 + .../StorageObjectStorageSource.cpp | 7 ++-- src/Storages/StorageFile.cpp | 9 +++- src/Storages/StorageURL.cpp | 5 +-- src/Storages/VirtualColumnUtils.cpp | 41 +++++++++++-------- src/Storages/VirtualColumnUtils.h | 2 +- 7 files changed, 48 insertions(+), 26 deletions(-) diff --git a/src/Core/NamesAndTypes.cpp b/src/Core/NamesAndTypes.cpp index 49ab822c738..23c174f9e7c 100644 --- a/src/Core/NamesAndTypes.cpp +++ b/src/Core/NamesAndTypes.cpp @@ -151,6 +151,15 @@ Names NamesAndTypesList::getNames() const return res; } +NameSet NamesAndTypesList::getNameSet() const +{ + NameSet res; + res.reserve(size()); + for (const NameAndTypePair & column : *this) + res.insert(column.name); + return res; +} + DataTypes NamesAndTypesList::getTypes() const { DataTypes res; diff --git a/src/Core/NamesAndTypes.h b/src/Core/NamesAndTypes.h index 29f40c45938..7f874172df3 100644 --- a/src/Core/NamesAndTypes.h +++ b/src/Core/NamesAndTypes.h @@ -100,6 +100,7 @@ public: void getDifference(const NamesAndTypesList & rhs, NamesAndTypesList & deleted, NamesAndTypesList & added) const; Names getNames() const; + NameSet getNameSet() const; DataTypes getTypes() const; /// Remove columns which names are not in the `names`. 
diff --git a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp index 0d4471e3bda..00155aee4c3 100644 --- a/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp +++ b/src/Storages/ObjectStorage/StorageObjectStorageSource.cpp @@ -131,11 +131,10 @@ std::shared_ptr StorageObjectStorageSourc else { ConfigurationPtr copy_configuration = configuration->clone(); - auto keys = configuration->getPaths(); - String partitioning_path = fs::path(configuration->getNamespace()) / keys[0]; - auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, partitioning_path, local_context); + auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, local_context); if (filter_dag) { + auto keys = configuration->getPaths(); std::vector paths; paths.reserve(keys.size()); for (const auto & key : keys) @@ -515,7 +514,7 @@ StorageObjectStorageSource::GlobIterator::GlobIterator( } recursive = key_with_globs == "/**"; - if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, key_with_globs, local_context)) + if (auto filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, local_context)) { VirtualColumnUtils::buildSetsForDAG(*filter_dag, getContext()); filter_expr = std::make_shared(std::move(*filter_dag)); diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 639af41c1cd..265a03242b5 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -355,14 +355,18 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user { fs::path user_files_absolute_path = fs::weakly_canonical(user_files_path); fs::path fs_table_path(table_path); + LOG_TRACE(getLogger("testing the paths"), "{} , {}", user_files_absolute_path, fs_table_path); if (fs_table_path.is_relative()) fs_table_path = user_files_absolute_path / fs_table_path; + 
LOG_TRACE(getLogger("testing the paths"), "fs_table_path = {}", fs_table_path); + Strings paths; /// Do not use fs::canonical or fs::weakly_canonical. /// Otherwise it will not allow to work with symlinks in `user_files_path` directory. String path = fs::absolute(fs_table_path).lexically_normal(); /// Normalize path. + LOG_TRACE(getLogger("testing the paths"), "path = {}", path); bool can_be_directory = true; if (path.find(PartitionedSink::PARTITION_ID_WILDCARD) != std::string::npos) @@ -395,7 +399,10 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user } for (const auto & cur_path : paths) + { checkCreationIsAllowed(context, user_files_absolute_path, cur_path, can_be_directory); + LOG_TRACE(getLogger("checking all paths"), "{}", cur_path); + } return paths; } @@ -1140,7 +1147,7 @@ StorageFileSource::FilesIterator::FilesIterator( { std::optional filter_dag; if (!distributed_processing && !archive_info && !files.empty()) - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, files[0], context_); + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, context_); if (filter_dag) { diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 572c4f20fa3..ab72d6a3a5a 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -214,10 +214,7 @@ public: std::optional filter_dag; if (!uris.empty()) - { - String partitioning_path = Poco::URI(uris[0]).getPath(); - filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, partitioning_path, context); - } + filter_dag = VirtualColumnUtils::createPathAndFileFilterDAG(predicate, virtual_columns, context); if (filter_dag) { diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 788aeb66657..2bd7325a789 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -1,5 +1,6 @@ #include #include 
+#include #include #include @@ -125,9 +126,18 @@ void filterBlockWithExpression(const ExpressionActionsPtr & actions, Block & blo } } +NamesAndTypesList getCommonVirtualsForFileLikeStorage() +{ + return {{"_path", std::make_shared(std::make_shared())}, + {"_file", std::make_shared(std::make_shared())}, + {"_size", makeNullable(std::make_shared())}, + {"_time", makeNullable(std::make_shared())}, + {"_etag", std::make_shared(std::make_shared())}}; +} + NameSet getVirtualNamesForFileLikeStorage() { - return {"_path", "_file", "_size", "_time", "_etag"}; + return getCommonVirtualsForFileLikeStorage().getNameSet(); } std::unordered_map parseHivePartitioningKeysAndValues(const String & path) @@ -155,8 +165,10 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto { VirtualColumnsDescription desc; - auto add_virtual = [&](const auto & name, const auto & type) + auto add_virtual = [&](const NameAndTypePair & pair) { + const auto & name = pair.getNameInStorage(); + const auto & type = pair.getTypeInStorage(); if (storage_columns.has(name)) { if (!context->getSettingsRef().use_hive_partitioning) @@ -173,11 +185,8 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto desc.addEphemeral(name, type, ""); }; - add_virtual("_path", std::make_shared(std::make_shared())); - add_virtual("_file", std::make_shared(std::make_shared())); - add_virtual("_size", makeNullable(std::make_shared())); - add_virtual("_time", makeNullable(std::make_shared())); - add_virtual("_etag", std::make_shared(std::make_shared())); + for (const auto & item : getCommonVirtualsForFileLikeStorage()) + add_virtual(item); if (context->getSettingsRef().use_hive_partitioning) { @@ -189,9 +198,9 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto if (type == nullptr) type = std::make_shared(); if (type->canBeInsideLowCardinality()) - add_virtual(item.first, std::make_shared(type)); + add_virtual({item.first, 
std::make_shared(type)}); else - add_virtual(item.first, type); + add_virtual({item.first, type}); } } @@ -228,18 +237,18 @@ static void addFilterDataToVirtualColumns(Block & block, const String & path, si block.getByName("_idx").column->assumeMutableRef().insert(idx); } -std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const String & path, const ContextPtr & context) +std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { if (!predicate || virtual_columns.empty()) return {}; Block block; - std::unordered_map keys; + NameSet common_virtuals; if (context->getSettingsRef().use_hive_partitioning) - keys = parseHivePartitioningKeysAndValues(path); + common_virtuals = getVirtualNamesForFileLikeStorage(); for (const auto & column : virtual_columns) { - if (column.name == "_file" || column.name == "_path" || keys.contains(column.name)) + if (column.name == "_file" || column.name == "_path" || !common_virtuals.contains(column.name)) block.insert({column.type->createColumn(), column.type, column.name}); } @@ -262,10 +271,10 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const auto it = keys.find(column.name); if (it != keys.end()) { - auto c = std::make_shared()->createColumn(); - c->insert(it->second); + auto string_column = std::make_shared()->createColumn(); + string_column->insert(it->second); block.insert({column.type->createColumn(), column.type, column.name}); - partitioning_columns.push_back({c->getPtr(), column.type, column.name}); + partitioning_columns.push_back({string_column->getPtr(), column.type, column.name}); } } block.insert({ColumnUInt64::create(), std::make_shared(), "_idx"}); diff --git a/src/Storages/VirtualColumnUtils.h b/src/Storages/VirtualColumnUtils.h index ecfe44a1956..a9c46569a83 100644 --- a/src/Storages/VirtualColumnUtils.h +++ 
b/src/Storages/VirtualColumnUtils.h @@ -75,7 +75,7 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage( const std::string & sample_path = "", std::optional format_settings_ = std::nullopt); -std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const String & path, const ContextPtr & context); +std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context); ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context); From d6b2a9d5343f77738f4f4f911acf2397a3841f8c Mon Sep 17 00:00:00 2001 From: yariks5s Date: Wed, 28 Aug 2024 22:32:44 +0000 Subject: [PATCH 08/33] CLICKHOUSE_LOCAL -> CLIENT --- .../03231_hive_partitioning_filtering.sh | 23 +++++++++++-------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index e545f53f257..d32d5596110 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -4,26 +4,31 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -$CLICKHOUSE_LOCAL --query_id="test_03231_1" --query " - SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; +DATA_DIR=$USER_FILES_PATH/$CLICKHOUSE_TEST_UNIQUE_NAME +mkdir -p $DATA_DIR +cp -r $CURDIR/data_hive/ $DATA_DIR + +$CLICKHOUSE_CLIENT --query_id="test_03231_1" --query " + SELECT countDistinct(_path) FROM file('$DATA_DIR/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; " -$CLICKHOUSE_LOCAL --query " +$CLICKHOUSE_CLIENT --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1' AND current_database = currentDatabase(); " -$CLICKHOUSE_LOCAL --query_id="test_03231_2" --query " - SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; +$CLICKHOUSE_CLIENT --query_id="test_03231_2" --query " + SELECT countDistinct(_path) FROM file('$DATA_DIR/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; " -$CLICKHOUSE_LOCAL --query " +$CLICKHOUSE_CLIENT --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' AND current_database = currentDatabase(); " -$CLICKHOUSE_LOCAL --query_id="test_03231_3" --query " - SELECT countDistinct(_path) FROM file('$CURDIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; +$CLICKHOUSE_CLIENT --query_id="test_03231_3" --query " + SELECT countDistinct(_path) FROM file('$DATA_DIR/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; " -$CLICKHOUSE_LOCAL --query " +$CLICKHOUSE_CLIENT --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3' AND current_database = 
currentDatabase(); " +rm -rf $DATA_DIR From edc5d8dd92f18ec9636a910676e7b8f0334ca805 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Wed, 28 Aug 2024 23:15:01 +0000 Subject: [PATCH 09/33] fix path --- .../0_stateless/03231_hive_partitioning_filtering.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index d32d5596110..52067299b0c 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -9,7 +9,7 @@ mkdir -p $DATA_DIR cp -r $CURDIR/data_hive/ $DATA_DIR $CLICKHOUSE_CLIENT --query_id="test_03231_1" --query " - SELECT countDistinct(_path) FROM file('$DATA_DIR/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; " $CLICKHOUSE_CLIENT --query " @@ -17,7 +17,7 @@ $CLICKHOUSE_CLIENT --query " " $CLICKHOUSE_CLIENT --query_id="test_03231_2" --query " - SELECT countDistinct(_path) FROM file('$DATA_DIR/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; " $CLICKHOUSE_CLIENT --query " @@ -25,7 +25,7 @@ $CLICKHOUSE_CLIENT --query " " $CLICKHOUSE_CLIENT --query_id="test_03231_3" --query " - SELECT countDistinct(_path) FROM file('$DATA_DIR/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; " $CLICKHOUSE_CLIENT --query " From 
afc4d08aadd8a5a1ad3b66bf2faada143100b8b6 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 29 Aug 2024 13:31:05 +0200 Subject: [PATCH 10/33] add no-fasttest tag --- tests/queries/0_stateless/03231_hive_partitioning_filtering.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 52067299b0c..8bd5f10bc4e 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 2adc61c21503e3c55e09c1ecd42833f6508d8584 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:39:22 +0200 Subject: [PATCH 11/33] add flush logs --- .../0_stateless/03231_hive_partitioning_filtering.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 8bd5f10bc4e..00763faafb8 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -9,24 +9,27 @@ DATA_DIR=$USER_FILES_PATH/$CLICKHOUSE_TEST_UNIQUE_NAME mkdir -p $DATA_DIR cp -r $CURDIR/data_hive/ $DATA_DIR -$CLICKHOUSE_CLIENT --query_id="test_03231_1" --query " +$CLICKHOUSE_CLIENT --query_id="test_03231_1" --query -nm " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; + SYSTEM FLUSH LOGS; " $CLICKHOUSE_CLIENT --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE 
query_id='test_03231_1' AND current_database = currentDatabase(); " -$CLICKHOUSE_CLIENT --query_id="test_03231_2" --query " +$CLICKHOUSE_CLIENT --query_id="test_03231_2" --query -nm " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; + SYSTEM FLUSH LOGS; " $CLICKHOUSE_CLIENT --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' AND current_database = currentDatabase(); " -$CLICKHOUSE_CLIENT --query_id="test_03231_3" --query " +$CLICKHOUSE_CLIENT --query_id="test_03231_3" --query -nm " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; + SYSTEM FLUSH LOGS; " $CLICKHOUSE_CLIENT --query " From 7a879980d8e2deace80b084e8281488995ca23c4 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Thu, 29 Aug 2024 18:25:11 +0200 Subject: [PATCH 12/33] try to fix tests --- .../0_stateless/03231_hive_partitioning_filtering.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 00763faafb8..1bfb5101ef3 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -9,7 +9,7 @@ DATA_DIR=$USER_FILES_PATH/$CLICKHOUSE_TEST_UNIQUE_NAME mkdir -p $DATA_DIR cp -r $CURDIR/data_hive/ $DATA_DIR -$CLICKHOUSE_CLIENT --query_id="test_03231_1" --query -nm " +$CLICKHOUSE_CLIENT --query_id="test_03231_1" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " @@ -18,7 +18,7 @@ $CLICKHOUSE_CLIENT --query " SELECT 
ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1' AND current_database = currentDatabase(); " -$CLICKHOUSE_CLIENT --query_id="test_03231_2" --query -nm " +$CLICKHOUSE_CLIENT --query_id="test_03231_2" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " @@ -27,7 +27,7 @@ $CLICKHOUSE_CLIENT --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' AND current_database = currentDatabase(); " -$CLICKHOUSE_CLIENT --query_id="test_03231_3" --query -nm " +$CLICKHOUSE_CLIENT --query_id="test_03231_3" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " From ec469a117d9cd808a0a4b89d8e5ce22c38b8fe2a Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 30 Aug 2024 00:56:35 +0200 Subject: [PATCH 13/33] testing --- .../0_stateless/03203_hive_style_partitioning.reference | 4 ++-- .../0_stateless/03231_hive_partitioning_filtering.sh | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/queries/0_stateless/03203_hive_style_partitioning.reference b/tests/queries/0_stateless/03203_hive_style_partitioning.reference index 0fbc1fb556e..bb6a345c6ec 100644 --- a/tests/queries/0_stateless/03203_hive_style_partitioning.reference +++ b/tests/queries/0_stateless/03203_hive_style_partitioning.reference @@ -33,8 +33,8 @@ Cross Elizabeth [1,2,3] 42.42 Array(Int64) LowCardinality(Float64) 101 -2070 -2070 +2071 +2071 b 1 1 diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 1bfb5101ef3..b6a62d3bc33 100755 --- 
a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-fasttest +# Tags: no-fasttest, no-parallel CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -10,7 +10,7 @@ mkdir -p $DATA_DIR cp -r $CURDIR/data_hive/ $DATA_DIR $CLICKHOUSE_CLIENT --query_id="test_03231_1" --query " - SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; + SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " @@ -19,7 +19,7 @@ $CLICKHOUSE_CLIENT --query " " $CLICKHOUSE_CLIENT --query_id="test_03231_2" --query " - SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; + SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " @@ -28,7 +28,7 @@ $CLICKHOUSE_CLIENT --query " " $CLICKHOUSE_CLIENT --query_id="test_03231_3" --query " - SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; + SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " From 620640a0423155800a6b3a80f15e67eba7614ecb Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 30 Aug 2024 12:58:21 +0200 Subject: [PATCH 14/33] just to test --- .../0_stateless/03231_hive_partitioning_filtering.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index b6a62d3bc33..d24c4e94c08 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -10,7 +10,7 @@ mkdir -p $DATA_DIR cp -r $CURDIR/data_hive/ $DATA_DIR $CLICKHOUSE_CLIENT --query_id="test_03231_1" --query " - SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; + SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 1 SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " @@ -19,7 +19,7 @@ $CLICKHOUSE_CLIENT --query " " $CLICKHOUSE_CLIENT --query_id="test_03231_2" --query " - SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; + SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 LIMIT 1 SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " @@ -28,7 +28,7 @@ $CLICKHOUSE_CLIENT --query " " $CLICKHOUSE_CLIENT --query_id="test_03231_3" --query " - SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; + SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] LIMIT 1 SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " From 24f4e87f8bf6fbd424895ab75ca71583c18ce320 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:20:22 +0200 Subject: [PATCH 15/33] revert debugging in tests --- .../0_stateless/03231_hive_partitioning_filtering.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index d24c4e94c08..5c63370f38e 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -10,7 +10,7 @@ mkdir -p $DATA_DIR cp -r $CURDIR/data_hive/ $DATA_DIR $CLICKHOUSE_CLIENT --query_id="test_03231_1" --query " - SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' LIMIT 1 SETTINGS use_hive_partitioning=1; + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " @@ -19,7 +19,7 @@ $CLICKHOUSE_CLIENT --query " " $CLICKHOUSE_CLIENT --query_id="test_03231_2" --query " - SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 LIMIT 1 SETTINGS use_hive_partitioning=1; + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " @@ -28,7 +28,7 @@ $CLICKHOUSE_CLIENT --query " " $CLICKHOUSE_CLIENT --query_id="test_03231_3" --query " - SELECT _path FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] LIMIT 1 SETTINGS use_hive_partitioning=1; + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " From 21f9669836fc39052c98ea8aa54f5d82888af100 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 3 Sep 2024 15:41:43 +0200 Subject: [PATCH 16/33] empty commit From f688b903dbc83c36a1d1526df4d50519790f0eb9 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: 
Tue, 3 Sep 2024 15:58:22 +0200 Subject: [PATCH 17/33] empty commit From 8896d1b78b9a96cc6f5b8349eb869d8e1c8807f0 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:46:29 +0200 Subject: [PATCH 18/33] try to fix tests --- .../03231_hive_partitioning_filtering.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 5c63370f38e..64e971a4891 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags: no-fasttest, no-parallel +# Tags: no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -9,30 +9,30 @@ DATA_DIR=$USER_FILES_PATH/$CLICKHOUSE_TEST_UNIQUE_NAME mkdir -p $DATA_DIR cp -r $CURDIR/data_hive/ $DATA_DIR -$CLICKHOUSE_CLIENT --query_id="test_03231_1" --query " +$CLICKHOUSE_CLIENT --query_id="test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " $CLICKHOUSE_CLIENT --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1' AND current_database = currentDatabase(); + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1' AND current_database = currentDatabase() and type='QueryFinish'; " -$CLICKHOUSE_CLIENT --query_id="test_03231_2" --query " +$CLICKHOUSE_CLIENT --query_id="test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; SYSTEM 
FLUSH LOGS; " $CLICKHOUSE_CLIENT --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' AND current_database = currentDatabase(); + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' AND current_database = currentDatabase() and type='QueryFinish'; " -$CLICKHOUSE_CLIENT --query_id="test_03231_3" --query " +$CLICKHOUSE_CLIENT --query_id="test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; SYSTEM FLUSH LOGS; " $CLICKHOUSE_CLIENT --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3' AND current_database = currentDatabase(); + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3' AND current_database = currentDatabase() and type='QueryFinish'; " rm -rf $DATA_DIR From 2fa6be55ff373b1cf847396ae80cbc8c40db5b4b Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 4 Sep 2024 17:02:01 +0200 Subject: [PATCH 19/33] tests fix --- .../0_stateless/03231_hive_partitioning_filtering.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 64e971a4891..a561758c726 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -15,7 +15,7 @@ $CLICKHOUSE_CLIENT --query_id="test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME" --quer " $CLICKHOUSE_CLIENT --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1' AND current_database = currentDatabase() and type='QueryFinish'; + SELECT 
ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME' AND current_database = currentDatabase() and type='QueryFinish'; " $CLICKHOUSE_CLIENT --query_id="test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " @@ -24,7 +24,7 @@ $CLICKHOUSE_CLIENT --query_id="test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME" --quer " $CLICKHOUSE_CLIENT --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2' AND current_database = currentDatabase() and type='QueryFinish'; + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME' AND current_database = currentDatabase() and type='QueryFinish'; " $CLICKHOUSE_CLIENT --query_id="test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " @@ -33,6 +33,6 @@ $CLICKHOUSE_CLIENT --query_id="test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME" --quer " $CLICKHOUSE_CLIENT --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3' AND current_database = currentDatabase() and type='QueryFinish'; + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME' AND current_database = currentDatabase() and type='QueryFinish'; " rm -rf $DATA_DIR From a903e1a726eb864049804fc906195aa669fa9ece Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 6 Sep 2024 20:24:18 +0200 Subject: [PATCH 20/33] remove logging + fixing bug --- src/Storages/StorageFile.cpp | 7 ---- src/Storages/VirtualColumnUtils.cpp | 34 ++++++++----------- .../03231_hive_partitioning_filtering.sh | 9 +++++ 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 655cc064fea..55bc8083ec8 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -356,18 
+356,14 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user { fs::path user_files_absolute_path = fs::weakly_canonical(user_files_path); fs::path fs_table_path(table_path); - LOG_TRACE(getLogger("testing the paths"), "{} , {}", user_files_absolute_path, fs_table_path); if (fs_table_path.is_relative()) fs_table_path = user_files_absolute_path / fs_table_path; - LOG_TRACE(getLogger("testing the paths"), "fs_table_path = {}", fs_table_path); - Strings paths; /// Do not use fs::canonical or fs::weakly_canonical. /// Otherwise it will not allow to work with symlinks in `user_files_path` directory. String path = fs::absolute(fs_table_path).lexically_normal(); /// Normalize path. - LOG_TRACE(getLogger("testing the paths"), "path = {}", path); bool can_be_directory = true; if (path.find(PartitionedSink::PARTITION_ID_WILDCARD) != std::string::npos) @@ -400,10 +396,7 @@ Strings StorageFile::getPathsList(const String & table_path, const String & user } for (const auto & cur_path : paths) - { checkCreationIsAllowed(context, user_files_absolute_path, cur_path, can_be_directory); - LOG_TRACE(getLogger("checking all paths"), "{}", cur_path); - } return paths; } diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 2bd7325a789..523f236bf59 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -207,7 +207,7 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto return desc; } -static void addFilterDataToVirtualColumns(Block & block, const String & path, size_t idx, ColumnsWithTypeAndName partitioning_keys, const ContextPtr & context) +static void addFilterDataToVirtualColumns(Block & block, const String & path, size_t idx, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { if (block.has("_path")) block.getByName("_path").column->assumeMutableRef().insert(path); @@ -224,13 +224,20 @@ static void 
addFilterDataToVirtualColumns(Block & block, const String & path, si block.getByName("_file").column->assumeMutableRef().insert(file); } - for (const auto & item : partitioning_keys) + std::unordered_map keys; + if (context->getSettingsRef().use_hive_partitioning) + keys = parseHivePartitioningKeysAndValues(path); + + for (const auto & virt_column : virtual_columns) { - if (block.has(item.name)) + auto it = keys.find(virt_column.name); + if (it != keys.end()) { - auto column = block.getByName(item.name).column; - ReadBufferFromString buf(item.column->getDataAt(0).toView()); - item.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); + if (!block.has(virt_column.name)) + block.insert({virt_column.type->createColumn(), virt_column.type, virt_column.name}); + auto & column = block.getByName(virt_column.name).column; + ReadBufferFromString buf(it->second); + virt_column.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); } } @@ -259,28 +266,15 @@ std::optional createPathAndFileFilterDAG(const ActionsDAG::Node * pr ColumnPtr getFilterByPathAndFileIndexes(const std::vector & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { Block block; - std::unordered_map keys; - ColumnsWithTypeAndName partitioning_columns; - if (context->getSettingsRef().use_hive_partitioning) - keys = parseHivePartitioningKeysAndValues(paths[0]); for (const auto & column : virtual_columns) { if (column.name == "_file" || column.name == "_path") block.insert({column.type->createColumn(), column.type, column.name}); - - auto it = keys.find(column.name); - if (it != keys.end()) - { - auto string_column = std::make_shared()->createColumn(); - string_column->insert(it->second); - block.insert({column.type->createColumn(), column.type, column.name}); - partitioning_columns.push_back({string_column->getPtr(), 
column.type, column.name}); - } } block.insert({ColumnUInt64::create(), std::make_shared(), "_idx"}); for (size_t i = 0; i != paths.size(); ++i) - addFilterDataToVirtualColumns(block, paths[i], i, partitioning_columns, context); + addFilterDataToVirtualColumns(block, paths[i], i, virtual_columns, context); filterBlockWithExpression(actions, block); diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index a561758c726..b66d2971cac 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -11,6 +11,9 @@ cp -r $CURDIR/data_hive/ $DATA_DIR $CLICKHOUSE_CLIENT --query_id="test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; +" + +$CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; " @@ -20,6 +23,9 @@ $CLICKHOUSE_CLIENT --query " $CLICKHOUSE_CLIENT --query_id="test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; +" + +$CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; " @@ -29,6 +35,9 @@ $CLICKHOUSE_CLIENT --query " $CLICKHOUSE_CLIENT --query_id="test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; +" + +$CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; " From 2876a4e7146d53a032d74fd8bf54c9abf1dc988a Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 11 Sep 2024 13:32:12 +0200 Subject: [PATCH 21/33] add retries --- .../03231_hive_partitioning_filtering.sh | 43 +++++++++++++++---- 1 file 
changed, 34 insertions(+), 9 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index b66d2971cac..13bbfac349d 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -17,9 +17,17 @@ $CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; " -$CLICKHOUSE_CLIENT --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME' AND current_database = currentDatabase() and type='QueryFinish'; -" +for i in {1..5}; do + count=$( $CLICKHOUSE_CLIENT --query " + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log + WHERE query_id='test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME' AND + current_database = currentDatabase() and type='QueryFinish';" ) + if [[ "$count" == "1" ]]; then + echo "1" + break + fi + sleep 1 +done $CLICKHOUSE_CLIENT --query_id="test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; @@ -29,9 +37,17 @@ $CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; " -$CLICKHOUSE_CLIENT --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME' AND current_database = currentDatabase() and type='QueryFinish'; -" +for i in {1..5}; do + count=$( $CLICKHOUSE_CLIENT --query " + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log + WHERE query_id='test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME' AND + current_database = currentDatabase() and type='QueryFinish';" ) + if [[ "$count" == "1" ]]; then + echo "1" + break + fi + sleep 1 +done $CLICKHOUSE_CLIENT --query_id="test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " SELECT countDistinct(_path) FROM 
file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; @@ -41,7 +57,16 @@ $CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; " -$CLICKHOUSE_CLIENT --query " - SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME' AND current_database = currentDatabase() and type='QueryFinish'; -" +for i in {1..5}; do + count=$( $CLICKHOUSE_CLIENT --query " + SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log + WHERE query_id='test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME' AND + current_database = currentDatabase() and type='QueryFinish';" ) + if [[ "$count" == "1" ]]; then + echo "1" + break + fi + sleep 1 +done + rm -rf $DATA_DIR From e8cec05d08dd53051d767d7725404abf777f6c45 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Wed, 11 Sep 2024 13:52:20 +0200 Subject: [PATCH 22/33] shellcheck --- .../0_stateless/03231_hive_partitioning_filtering.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 13bbfac349d..41f11ff869c 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -17,7 +17,7 @@ $CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; " -for i in {1..5}; do +for _ in {1..5}; do count=$( $CLICKHOUSE_CLIENT --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME' AND @@ -37,7 +37,7 @@ $CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; " -for i in {1..5}; do +for _ in {1..5}; do count=$( $CLICKHOUSE_CLIENT --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME' AND @@ -57,7 +57,7 
@@ $CLICKHOUSE_CLIENT --query " SYSTEM FLUSH LOGS; " -for i in {1..5}; do +for _ in {1..5}; do count=$( $CLICKHOUSE_CLIENT --query " SELECT ProfileEvents['EngineFileLikeReadFiles'] FROM system.query_log WHERE query_id='test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME' AND From 14a6b0422b6b7abb68c2dc8a55fb566bdafafe25 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 13 Sep 2024 16:33:17 +0200 Subject: [PATCH 23/33] disable optimize_count_from_files --- .../0_stateless/03231_hive_partitioning_filtering.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh index 41f11ff869c..30ae5b01a98 100755 --- a/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh +++ b/tests/queries/0_stateless/03231_hive_partitioning_filtering.sh @@ -10,7 +10,7 @@ mkdir -p $DATA_DIR cp -r $CURDIR/data_hive/ $DATA_DIR $CLICKHOUSE_CLIENT --query_id="test_03231_1_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " - SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1; + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/column0=*/sample.parquet') WHERE column0 = 'Elizabeth' SETTINGS use_hive_partitioning=1, optimize_count_from_files=0; " $CLICKHOUSE_CLIENT --query " @@ -30,7 +30,7 @@ for _ in {1..5}; do done $CLICKHOUSE_CLIENT --query_id="test_03231_2_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " - SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1; + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/identifier=*/email.csv') WHERE identifier = 2070 SETTINGS use_hive_partitioning=1, optimize_count_from_files=0; " $CLICKHOUSE_CLIENT --query " @@ -50,7 +50,7 @@ 
for _ in {1..5}; do done $CLICKHOUSE_CLIENT --query_id="test_03231_3_$CLICKHOUSE_TEST_UNIQUE_NAME" --query " - SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1; + SELECT countDistinct(_path) FROM file('$DATA_DIR/data_hive/partitioning/array=*/sample.parquet') WHERE array = [1,2,3] SETTINGS use_hive_partitioning=1, optimize_count_from_files=0; " $CLICKHOUSE_CLIENT --query " From c184aae686dc0918ec24f774c6d125e5b639e20d Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 13 Sep 2024 16:40:01 +0200 Subject: [PATCH 24/33] review --- src/Storages/VirtualColumnUtils.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 523f236bf59..8d46a49bb73 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -233,8 +233,6 @@ static void addFilterDataToVirtualColumns(Block & block, const String & path, si auto it = keys.find(virt_column.name); if (it != keys.end()) { - if (!block.has(virt_column.name)) - block.insert({virt_column.type->createColumn(), virt_column.type, virt_column.name}); auto & column = block.getByName(virt_column.name).column; ReadBufferFromString buf(it->second); virt_column.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); From 991279e5c626067f9f371da5115e6993ed665ee6 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 13 Sep 2024 19:23:00 +0200 Subject: [PATCH 25/33] revert --- src/Storages/VirtualColumnUtils.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 8d46a49bb73..523f236bf59 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -233,6 
+233,8 @@ static void addFilterDataToVirtualColumns(Block & block, const String & path, si auto it = keys.find(virt_column.name); if (it != keys.end()) { + if (!block.has(virt_column.name)) + block.insert({virt_column.type->createColumn(), virt_column.type, virt_column.name}); auto & column = block.getByName(virt_column.name).column; ReadBufferFromString buf(it->second); virt_column.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); From ad31d86a15f865cca2b18d6240cfbf17adc26435 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 13 Sep 2024 19:58:19 +0200 Subject: [PATCH 26/33] move the block inserting --- src/Storages/VirtualColumnUtils.cpp | 44 ++++++++++++++++++----------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 523f236bf59..2daffb43c84 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -207,7 +207,7 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto return desc; } -static void addFilterDataToVirtualColumns(Block & block, const String & path, size_t idx, const NamesAndTypesList & virtual_columns, const ContextPtr & context) +static void addFilterDataToVirtualColumns(Block & block, const String & path, size_t idx, ColumnWithTypeAndName partitioning_column, const ContextPtr & context) { if (block.has("_path")) block.getByName("_path").column->assumeMutableRef().insert(path); @@ -224,21 +224,11 @@ static void addFilterDataToVirtualColumns(Block & block, const String & path, si block.getByName("_file").column->assumeMutableRef().insert(file); } - std::unordered_map<std::string, std::string> keys; - if (context->getSettingsRef().use_hive_partitioning) - keys = parseHivePartitioningKeysAndValues(path); - - for (const auto & virt_column : virtual_columns) + if (block.has(partitioning_column.name)) { 
- auto it = keys.find(virt_column.name); - if (it != keys.end()) - { - if (!block.has(virt_column.name)) - block.insert({virt_column.type->createColumn(), virt_column.type, virt_column.name}); - auto & column = block.getByName(virt_column.name).column; - ReadBufferFromString buf(it->second); - virt_column.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); - } + auto & column = block.getByName(partitioning_column.name).column; + ReadBufferFromString buf(partitioning_column.column->getDataAt(0).toView()); + partitioning_column.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); } block.getByName("_idx").column->assumeMutableRef().insert(idx); @@ -266,15 +256,37 @@ std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * pr ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { Block block; + std::vector<std::unordered_map<std::string, std::string>> keys_vec; + ColumnsWithTypeAndName partitioning_columns; + if (context->getSettingsRef().use_hive_partitioning) + { + for (const auto & path : paths) + keys_vec.push_back(parseHivePartitioningKeysAndValues(path)); + } for (const auto & column : virtual_columns) { if (column.name == "_file" || column.name == "_path") block.insert({column.type->createColumn(), column.type, column.name}); + else + { + for (auto & keys : keys_vec) + { + const auto & it = keys.find(column.name); + if (it != keys.end()) + { + auto string_column = std::make_shared<DataTypeString>()->createColumn(); + string_column->insert(it->second); + block.insert({column.type->createColumn(), column.type, column.name}); + partitioning_columns.push_back({string_column->getPtr(), column.type, column.name}); + keys.erase(it); + } + } + } } block.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"}); for (size_t i = 0; i != paths.size(); ++i) - 
addFilterDataToVirtualColumns(block, paths[i], i, virtual_columns, context); + addFilterDataToVirtualColumns(block, paths[i], i, partitioning_columns[i], context); filterBlockWithExpression(actions, block); From 0d1d750437f12ce6ab5dcec828250e387c389bc8 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 13 Sep 2024 20:43:51 +0200 Subject: [PATCH 27/33] fix crash --- src/Storages/VirtualColumnUtils.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 2daffb43c84..667a7e2506f 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -285,7 +285,7 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const } block.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"}); - for (size_t i = 0; i != paths.size(); ++i) + for (size_t i = 0; i != partitioning_columns.size(); ++i) addFilterDataToVirtualColumns(block, paths[i], i, partitioning_columns[i], context); filterBlockWithExpression(actions, block); From 7d5203f8a7d43fef85f35868616a67070bc7899a Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Fri, 13 Sep 2024 21:38:48 +0200 Subject: [PATCH 28/33] add resize for partitioning_columns --- src/Storages/VirtualColumnUtils.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 667a7e2506f..b5deab95f3f 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -285,6 +285,7 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const } block.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"}); + partitioning_columns.resize(paths.size()); for (size_t i = 0; i != partitioning_columns.size(); ++i) addFilterDataToVirtualColumns(block, paths[i], i, partitioning_columns[i], context); 
From 04f23332c3d11a6098d69114a882f51cd5f13e9a Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 16 Sep 2024 15:59:22 +0200 Subject: [PATCH 29/33] fix filter issue --- src/Storages/VirtualColumnUtils.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index b5deab95f3f..4e163be9128 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -276,7 +276,8 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const { auto string_column = std::make_shared<DataTypeString>()->createColumn(); string_column->insert(it->second); - block.insert({column.type->createColumn(), column.type, column.name}); + if (!block.has(column.name)) + block.insert({column.type->createColumn(), column.type, column.name}); partitioning_columns.push_back({string_column->getPtr(), column.type, column.name}); keys.erase(it); } @@ -286,7 +287,7 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const block.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"}); partitioning_columns.resize(paths.size()); - for (size_t i = 0; i != partitioning_columns.size(); ++i) + for (size_t i = 0; i != paths.size(); ++i) addFilterDataToVirtualColumns(block, paths[i], i, partitioning_columns[i], context); filterBlockWithExpression(actions, block); From 0cdec0acf10bba22b2cbe3eee7e26d7365739587 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Mon, 16 Sep 2024 19:13:30 +0200 Subject: [PATCH 30/33] fix logical error --- src/Storages/VirtualColumnUtils.cpp | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 4e163be9128..70133cf02ca 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -207,7 
+207,7 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto return desc; } -static void addFilterDataToVirtualColumns(Block & block, const String & path, size_t idx, ColumnWithTypeAndName partitioning_column, const ContextPtr & context) +static void addFilterDataToDefaultColumns(Block & block, const String & path, size_t idx) { if (block.has("_path")) block.getByName("_path").column->assumeMutableRef().insert(path); @@ -224,16 +224,22 @@ static void addFilterDataToVirtualColumns(Block & block, const String & path, si block.getByName("_file").column->assumeMutableRef().insert(file); } - if (block.has(partitioning_column.name)) - { - auto & column = block.getByName(partitioning_column.name).column; - ReadBufferFromString buf(partitioning_column.column->getDataAt(0).toView()); - partitioning_column.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); - } - block.getByName("_idx").column->assumeMutableRef().insert(idx); } +static void addFilterDataToPartitioningColumns(Block & block, ColumnsWithTypeAndName partitioning_keys, const ContextPtr & context) +{ + for (const auto & item : partitioning_keys) + { + if (block.has(item.name)) + { + auto & column = block.getByName(item.name).column; + ReadBufferFromString buf(item.column->getDataAt(0).toView()); + item.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); + } + } +} + std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { if (!predicate || virtual_columns.empty()) @@ -286,9 +292,11 @@ ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const } block.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"}); - partitioning_columns.resize(paths.size()); for (size_t i = 0; i != paths.size(); ++i) - addFilterDataToVirtualColumns(block, paths[i], i, 
partitioning_columns[i], context); + addFilterDataToDefaultColumns(block, paths[i], i); + + if (context->getSettingsRef().use_hive_partitioning) + addFilterDataToPartitioningColumns(block, partitioning_columns, context); filterBlockWithExpression(actions, block); From cb92aaf968537cc767a87f3a4e139cf5957a298c Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 17 Sep 2024 11:26:13 +0200 Subject: [PATCH 31/33] fix 03232_file_path_normalizing --- tests/queries/0_stateless/03232_file_path_normalizing.reference | 2 +- tests/queries/0_stateless/03232_file_path_normalizing.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/03232_file_path_normalizing.reference b/tests/queries/0_stateless/03232_file_path_normalizing.reference index 953db2c5dfe..3b41cf34056 100644 --- a/tests/queries/0_stateless/03232_file_path_normalizing.reference +++ b/tests/queries/0_stateless/03232_file_path_normalizing.reference @@ -1 +1 @@ -data_hive/partitioning/column0=Elizabeth/sample.parquet +data_hive/partitioning/non_existing_column=Elizabeth/sample.parquet diff --git a/tests/queries/0_stateless/03232_file_path_normalizing.sh b/tests/queries/0_stateless/03232_file_path_normalizing.sh index e7a7a65be51..add6049f9b5 100755 --- a/tests/queries/0_stateless/03232_file_path_normalizing.sh +++ b/tests/queries/0_stateless/03232_file_path_normalizing.sh @@ -5,4 +5,4 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -$CLICKHOUSE_LOCAL -q "SELECT substring(_path, position(_path, 'data_hive')) FROM file('$CURDIR/data_hive/partitioning/column0=*/sample.parquet') LIMIT 1;" +$CLICKHOUSE_LOCAL -q "SELECT substring(_path, position(_path, 'data_hive')) FROM file('$CURDIR/data_hive/partitioning/non_existing_column=*/sample.parquet') LIMIT 1;" From e8d50aa97ff10fc9a853a4689c9346baf23ecd82 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:02:33 +0200 Subject: [PATCH 32/33] review --- src/Storages/VirtualColumnUtils.cpp | 53 ++++++++--------------------- 1 file changed, 15 insertions(+), 38 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 70133cf02ca..155420434fa 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -207,7 +207,7 @@ VirtualColumnsDescription getVirtualsForFileLikeStorage(ColumnsDescription & sto return desc; } -static void addFilterDataToDefaultColumns(Block & block, const String & path, size_t idx) +static void addPathAndFileToVirtualColumns(Block & block, const String & path, size_t idx, const FormatSettings & format_settings, bool use_hive_partitioning) { if (block.has("_path")) block.getByName("_path").column->assumeMutableRef().insert(path); @@ -224,20 +224,20 @@ static void addFilterDataToDefaultColumns(Block & block, const String & path, si block.getByName("_file").column->assumeMutableRef().insert(file); } - block.getByName("_idx").column->assumeMutableRef().insert(idx); -} - -static void addFilterDataToPartitioningColumns(Block & block, ColumnsWithTypeAndName partitioning_keys, const ContextPtr & context) -{ - for (const auto & item : partitioning_keys) + if (use_hive_partitioning) { - if (block.has(item.name)) + auto keys_and_values = parseHivePartitioningKeysAndValues(path); + for (const auto & [key, value] : keys_and_values) { - auto & column = 
block.getByName(item.name).column; - ReadBufferFromString buf(item.column->getDataAt(0).toView()); - item.type->getDefaultSerialization()->deserializeWholeText(column->assumeMutableRef(), buf, getFormatSettings(context)); + if (const auto * column = block.findByName(key)) + { + ReadBufferFromString buf(value); + column->type->getDefaultSerialization()->deserializeWholeText(column->column->assumeMutableRef(), buf, format_settings); + } } } + + block.getByName("_idx").column->assumeMutableRef().insert(idx); } std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, const ContextPtr & context) @@ -262,41 +262,18 @@ std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * pr ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { Block block; - std::vector<std::unordered_map<std::string, std::string>> keys_vec; - ColumnsWithTypeAndName partitioning_columns; + NameSet common_virtuals; if (context->getSettingsRef().use_hive_partitioning) - { - for (const auto & path : paths) - keys_vec.push_back(parseHivePartitioningKeysAndValues(path)); - } + common_virtuals = getVirtualNamesForFileLikeStorage(); for (const auto & column : virtual_columns) { - if (column.name == "_file" || column.name == "_path") + if (column.name == "_file" || column.name == "_path" || !common_virtuals.contains(column.name)) block.insert({column.type->createColumn(), column.type, column.name}); - else - { - for (auto & keys : keys_vec) - { - const auto & it = keys.find(column.name); - if (it != keys.end()) - { - auto string_column = std::make_shared<DataTypeString>()->createColumn(); - string_column->insert(it->second); - if (!block.has(column.name)) - block.insert({column.type->createColumn(), column.type, column.name}); - partitioning_columns.push_back({string_column->getPtr(), column.type, column.name}); - keys.erase(it); - } - } - } } 
block.insert({ColumnUInt64::create(), std::make_shared<DataTypeUInt64>(), "_idx"}); for (size_t i = 0; i != paths.size(); ++i) - addFilterDataToDefaultColumns(block, paths[i], i); - - if (context->getSettingsRef().use_hive_partitioning) - addFilterDataToPartitioningColumns(block, partitioning_columns, context); + addPathAndFileToVirtualColumns(block, paths[i], i, getFormatSettings(context), context->getSettingsRef().use_hive_partitioning); filterBlockWithExpression(actions, block); From 3a7c68a052381b2218017b73f40a92effcd9cb93 Mon Sep 17 00:00:00 2001 From: Yarik Briukhovetskyi <114298166+yariks5s@users.noreply.github.com> Date: Tue, 17 Sep 2024 15:39:26 +0200 Subject: [PATCH 33/33] Update src/Storages/VirtualColumnUtils.cpp Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/Storages/VirtualColumnUtils.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/Storages/VirtualColumnUtils.cpp b/src/Storages/VirtualColumnUtils.cpp index 155420434fa..14bf8ac8c13 100644 --- a/src/Storages/VirtualColumnUtils.cpp +++ b/src/Storages/VirtualColumnUtils.cpp @@ -262,9 +262,7 @@ std::optional<ActionsDAG> createPathAndFileFilterDAG(const ActionsDAG::Node * pr ColumnPtr getFilterByPathAndFileIndexes(const std::vector<String> & paths, const ExpressionActionsPtr & actions, const NamesAndTypesList & virtual_columns, const ContextPtr & context) { Block block; - NameSet common_virtuals; - if (context->getSettingsRef().use_hive_partitioning) - common_virtuals = getVirtualNamesForFileLikeStorage(); + NameSet common_virtuals = getVirtualNamesForFileLikeStorage(); for (const auto & column : virtual_columns) { if (column.name == "_file" || column.name == "_path" || !common_virtuals.contains(column.name))