From 26c9042ea0f0529f464435cbeef111f3e6d396a5 Mon Sep 17 00:00:00 2001
From: Dmitry Novik
Date: Mon, 12 Jun 2023 17:06:52 +0000
Subject: [PATCH 001/884] Analyzer: support aliases in StorageMerge

---
 src/Analyzer/IQueryTreePass.h | 2 +-
 ...egateFunctionsArithmericOperationsPass.cpp | 2 +-
 ...gregateFunctionsArithmericOperationsPass.h | 2 +-
 src/Analyzer/Passes/ArrayExistsToHasPass.cpp | 2 +-
 src/Analyzer/Passes/ArrayExistsToHasPass.h | 2 +-
 src/Analyzer/Passes/AutoFinalOnQueryPass.cpp | 2 +-
 src/Analyzer/Passes/AutoFinalOnQueryPass.h | 2 +-
 .../Passes/ComparisonTupleEliminationPass.cpp | 2 +-
 .../Passes/ComparisonTupleEliminationPass.h | 2 +-
 .../Passes/ConvertOrLikeChainPass.cpp | 2 +-
 src/Analyzer/Passes/ConvertOrLikeChainPass.h | 2 +-
 src/Analyzer/Passes/ConvertQueryToCNFPass.cpp | 2 +-
 src/Analyzer/Passes/ConvertQueryToCNFPass.h | 2 +-
 src/Analyzer/Passes/CountDistinctPass.cpp | 2 +-
 src/Analyzer/Passes/CountDistinctPass.h | 2 +-
 src/Analyzer/Passes/CrossToInnerJoinPass.cpp | 2 +-
 src/Analyzer/Passes/CrossToInnerJoinPass.h | 2 +-
 .../Passes/FunctionToSubcolumnsPass.cpp | 2 +-
 .../Passes/FunctionToSubcolumnsPass.h | 2 +-
 src/Analyzer/Passes/FuseFunctionsPass.cpp | 2 +-
 src/Analyzer/Passes/FuseFunctionsPass.h | 2 +-
 .../Passes/GroupingFunctionsResolvePass.cpp | 2 +-
 .../Passes/GroupingFunctionsResolvePass.h | 2 +-
 src/Analyzer/Passes/IfChainToMultiIfPass.cpp | 2 +-
 src/Analyzer/Passes/IfChainToMultiIfPass.h | 2 +-
 .../Passes/IfConstantConditionPass.cpp | 2 +-
 src/Analyzer/Passes/IfConstantConditionPass.h | 2 +-
 .../Passes/IfTransformStringsToEnumPass.cpp | 2 +-
 .../Passes/IfTransformStringsToEnumPass.h | 2 +-
 .../Passes/LogicalExpressionOptimizerPass.cpp | 2 +-
 .../Passes/LogicalExpressionOptimizerPass.h | 2 +-
 src/Analyzer/Passes/MultiIfToIfPass.cpp | 2 +-
 src/Analyzer/Passes/MultiIfToIfPass.h | 2 +-
 .../Passes/NormalizeCountVariantsPass.cpp | 2 +-
 .../Passes/NormalizeCountVariantsPass.h | 2 +-
 .../OptimizeGroupByFunctionKeysPass.cpp | 2 +-
 .../Passes/OptimizeGroupByFunctionKeysPass.h | 2 +-
 ...ptimizeRedundantFunctionsInOrderByPass.cpp | 2 +-
 .../OptimizeRedundantFunctionsInOrderByPass.h | 2 +-
 ...OrderByLimitByDuplicateEliminationPass.cpp | 2 +-
 .../OrderByLimitByDuplicateEliminationPass.h | 2 +-
 .../Passes/OrderByTupleEliminationPass.cpp | 2 +-
 .../Passes/OrderByTupleEliminationPass.h | 2 +-
 src/Analyzer/Passes/QueryAnalysisPass.cpp | 15 ++-
 src/Analyzer/Passes/QueryAnalysisPass.h | 2 +-
 .../RewriteAggregateFunctionWithIfPass.cpp | 2 +-
 .../RewriteAggregateFunctionWithIfPass.h | 2 +-
 .../Passes/ShardNumColumnToFunctionPass.cpp | 2 +-
 .../Passes/ShardNumColumnToFunctionPass.h | 2 +-
 src/Analyzer/Passes/SumIfToCountIfPass.cpp | 2 +-
 src/Analyzer/Passes/SumIfToCountIfPass.h | 2 +-
 .../UniqInjectiveFunctionsEliminationPass.cpp | 2 +-
 .../UniqInjectiveFunctionsEliminationPass.h | 2 +-
 src/Planner/PlannerActionsVisitor.cpp | 4 +-
 src/Storages/StorageDistributed.cpp | 4 +-
 src/Storages/StorageMerge.cpp | 121 +++++++++++++++---
 src/Storages/StorageMerge.h | 9 +-
 57 files changed, 177 insertions(+), 80 deletions(-)

diff --git a/src/Analyzer/IQueryTreePass.h b/src/Analyzer/IQueryTreePass.h
index 4293934c32d..d4499c3271c 100644
--- a/src/Analyzer/IQueryTreePass.h
+++ b/src/Analyzer/IQueryTreePass.h
@@ -31,7 +31,7 @@ public:
     virtual String getDescription() = 0;
 
     /// Run pass over query tree
-    virtual void run(QueryTreeNodePtr query_tree_node, ContextPtr context) = 0;
+    virtual void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) = 0;
 
 };
 
diff --git
a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp index 1476a66c892..2a69292ff78 100644 --- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp +++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.cpp @@ -201,7 +201,7 @@ private: } -void AggregateFunctionsArithmericOperationsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void AggregateFunctionsArithmericOperationsPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { AggregateFunctionsArithmericOperationsVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.h b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.h index a89d2f87ad9..d510b62f9be 100644 --- a/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.h +++ b/src/Analyzer/Passes/AggregateFunctionsArithmericOperationsPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Extract arithmeric operations from aggregate functions."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/ArrayExistsToHasPass.cpp b/src/Analyzer/Passes/ArrayExistsToHasPass.cpp index c0f958588f1..63d417cd570 100644 --- a/src/Analyzer/Passes/ArrayExistsToHasPass.cpp +++ b/src/Analyzer/Passes/ArrayExistsToHasPass.cpp @@ -92,7 +92,7 @@ public: } -void RewriteArrayExistsToHasPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void RewriteArrayExistsToHasPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { RewriteArrayExistsToHasVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/ArrayExistsToHasPass.h b/src/Analyzer/Passes/ArrayExistsToHasPass.h index 8f4623116e3..4795b61c625 100644 --- a/src/Analyzer/Passes/ArrayExistsToHasPass.h +++ b/src/Analyzer/Passes/ArrayExistsToHasPass.h @@ -20,7 +20,7 @@ public: String getDescription() override { return "Rewrite arrayExists(func, arr) functions to has(arr, elem) when logically equivalent"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp b/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp index 15326ca1dc8..ee9e1023949 100644 --- a/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp +++ b/src/Analyzer/Passes/AutoFinalOnQueryPass.cpp @@ -67,7 +67,7 @@ private: } -void AutoFinalOnQueryPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void AutoFinalOnQueryPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { auto visitor = AutoFinalOnQueryPassVisitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/AutoFinalOnQueryPass.h b/src/Analyzer/Passes/AutoFinalOnQueryPass.h index 3489597108c..d595b98d349 100644 --- a/src/Analyzer/Passes/AutoFinalOnQueryPass.h +++ b/src/Analyzer/Passes/AutoFinalOnQueryPass.h @@ -25,7 +25,7 @@ public: return "Automatically applies final modifier to table expressions in queries if it is supported and if user level final setting is set"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git 
a/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp b/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp index 4e0562a2fe8..57920065513 100644 --- a/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp +++ b/src/Analyzer/Passes/ComparisonTupleEliminationPass.cpp @@ -201,7 +201,7 @@ private: } -void ComparisonTupleEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void ComparisonTupleEliminationPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { ComparisonTupleEliminationPassVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/ComparisonTupleEliminationPass.h b/src/Analyzer/Passes/ComparisonTupleEliminationPass.h index 954a9d6a2f0..7f4245e2d95 100644 --- a/src/Analyzer/Passes/ComparisonTupleEliminationPass.h +++ b/src/Analyzer/Passes/ComparisonTupleEliminationPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Rewrite tuples comparison into equivalent comparison of tuples arguments"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp b/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp index 7d7362fb742..0d2ddd20374 100644 --- a/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp +++ b/src/Analyzer/Passes/ConvertOrLikeChainPass.cpp @@ -132,7 +132,7 @@ private: } -void ConvertOrLikeChainPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void ConvertOrLikeChainPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { auto or_function_resolver = FunctionFactory::instance().get("or", context); auto match_function_resolver = FunctionFactory::instance().get("multiMatchAny", context); diff --git a/src/Analyzer/Passes/ConvertOrLikeChainPass.h b/src/Analyzer/Passes/ConvertOrLikeChainPass.h index 0f734bfa73d..90bccaa0e8d 100644 --- a/src/Analyzer/Passes/ConvertOrLikeChainPass.h +++ b/src/Analyzer/Passes/ConvertOrLikeChainPass.h @@ -14,7 +14,7 @@ public: String getDescription() override { return "Replaces all the 'or's with {i}like to multiMatchAny"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp index 4d32c96b845..ecba2e28749 100644 --- a/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp +++ b/src/Analyzer/Passes/ConvertQueryToCNFPass.cpp @@ -720,7 +720,7 @@ public: } -void ConvertLogicalExpressionToCNFPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void ConvertLogicalExpressionToCNFPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { const auto & settings = context->getSettingsRef(); if (!settings.convert_query_to_cnf) diff --git a/src/Analyzer/Passes/ConvertQueryToCNFPass.h b/src/Analyzer/Passes/ConvertQueryToCNFPass.h index 5ed874db006..60943c04d78 100644 --- a/src/Analyzer/Passes/ConvertQueryToCNFPass.h +++ b/src/Analyzer/Passes/ConvertQueryToCNFPass.h @@ -12,7 +12,7 @@ public: String getDescription() override { return "Convert logical expression to CNF and apply optimizations using constraints"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/CountDistinctPass.cpp b/src/Analyzer/Passes/CountDistinctPass.cpp 
index 945295f5cbc..eb2859020be 100644 --- a/src/Analyzer/Passes/CountDistinctPass.cpp +++ b/src/Analyzer/Passes/CountDistinctPass.cpp @@ -84,7 +84,7 @@ public: } -void CountDistinctPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void CountDistinctPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { CountDistinctVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/CountDistinctPass.h b/src/Analyzer/Passes/CountDistinctPass.h index cac5033c98f..33728b0228c 100644 --- a/src/Analyzer/Passes/CountDistinctPass.h +++ b/src/Analyzer/Passes/CountDistinctPass.h @@ -20,7 +20,7 @@ public: return "Optimize single countDistinct into count over subquery"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/CrossToInnerJoinPass.cpp b/src/Analyzer/Passes/CrossToInnerJoinPass.cpp index d4877d23f28..3283c163890 100644 --- a/src/Analyzer/Passes/CrossToInnerJoinPass.cpp +++ b/src/Analyzer/Passes/CrossToInnerJoinPass.cpp @@ -264,7 +264,7 @@ private: } -void CrossToInnerJoinPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void CrossToInnerJoinPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { CrossToInnerJoinVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/CrossToInnerJoinPass.h b/src/Analyzer/Passes/CrossToInnerJoinPass.h index 127d26dc41d..b0437c562ac 100644 --- a/src/Analyzer/Passes/CrossToInnerJoinPass.h +++ b/src/Analyzer/Passes/CrossToInnerJoinPass.h @@ -22,7 +22,7 @@ public: return "Replace CROSS JOIN with INNER JOIN"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp index 696483862e0..1b04136e6a4 100644 --- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp +++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.cpp @@ -202,7 +202,7 @@ private: } -void FunctionToSubcolumnsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void FunctionToSubcolumnsPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { FunctionToSubcolumnsVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/FunctionToSubcolumnsPass.h b/src/Analyzer/Passes/FunctionToSubcolumnsPass.h index 0e1d2583e7b..d4edcc5b922 100644 --- a/src/Analyzer/Passes/FunctionToSubcolumnsPass.h +++ b/src/Analyzer/Passes/FunctionToSubcolumnsPass.h @@ -24,7 +24,7 @@ public: String getDescription() override { return "Rewrite function to subcolumns, for example tupleElement(column, subcolumn) into column.subcolumn"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/FuseFunctionsPass.cpp b/src/Analyzer/Passes/FuseFunctionsPass.cpp index 14082697955..ef87528964c 100644 --- a/src/Analyzer/Passes/FuseFunctionsPass.cpp +++ b/src/Analyzer/Passes/FuseFunctionsPass.cpp @@ -254,7 +254,7 @@ void tryFuseQuantiles(QueryTreeNodePtr query_tree_node, ContextPtr context) } -void FuseFunctionsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void FuseFunctionsPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { 
tryFuseSumCountAvg(query_tree_node, context); tryFuseQuantiles(query_tree_node, context); diff --git a/src/Analyzer/Passes/FuseFunctionsPass.h b/src/Analyzer/Passes/FuseFunctionsPass.h index a92b77b1115..2fd85da4747 100644 --- a/src/Analyzer/Passes/FuseFunctionsPass.h +++ b/src/Analyzer/Passes/FuseFunctionsPass.h @@ -20,7 +20,7 @@ public: String getDescription() override { return "Replaces several calls of aggregate functions of the same family into one call"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/GroupingFunctionsResolvePass.cpp b/src/Analyzer/Passes/GroupingFunctionsResolvePass.cpp index 0cf5310a3ad..774014e5ffd 100644 --- a/src/Analyzer/Passes/GroupingFunctionsResolvePass.cpp +++ b/src/Analyzer/Passes/GroupingFunctionsResolvePass.cpp @@ -248,7 +248,7 @@ private: } -void GroupingFunctionsResolvePass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void GroupingFunctionsResolvePass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { GroupingFunctionsResolveVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/GroupingFunctionsResolvePass.h b/src/Analyzer/Passes/GroupingFunctionsResolvePass.h index 070c8dd9389..cd932f76977 100644 --- a/src/Analyzer/Passes/GroupingFunctionsResolvePass.h +++ b/src/Analyzer/Passes/GroupingFunctionsResolvePass.h @@ -24,7 +24,7 @@ public: String getDescription() override { return "Resolve GROUPING functions based on GROUP BY modifiers"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/IfChainToMultiIfPass.cpp b/src/Analyzer/Passes/IfChainToMultiIfPass.cpp index 1f97e012331..91a5709f142 100644 --- a/src/Analyzer/Passes/IfChainToMultiIfPass.cpp +++ b/src/Analyzer/Passes/IfChainToMultiIfPass.cpp @@ -73,7 +73,7 @@ private: } -void IfChainToMultiIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void IfChainToMultiIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { auto multi_if_function_ptr = FunctionFactory::instance().get("multiIf", context); IfChainToMultiIfPassVisitor visitor(std::move(multi_if_function_ptr), std::move(context)); diff --git a/src/Analyzer/Passes/IfChainToMultiIfPass.h b/src/Analyzer/Passes/IfChainToMultiIfPass.h index 43f3fb8831d..9e7335d93e4 100644 --- a/src/Analyzer/Passes/IfChainToMultiIfPass.h +++ b/src/Analyzer/Passes/IfChainToMultiIfPass.h @@ -18,7 +18,7 @@ public: String getDescription() override { return "Optimize if chain to multiIf"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/IfConstantConditionPass.cpp b/src/Analyzer/Passes/IfConstantConditionPass.cpp index 6f9cfe482f1..35c6718f018 100644 --- a/src/Analyzer/Passes/IfConstantConditionPass.cpp +++ b/src/Analyzer/Passes/IfConstantConditionPass.cpp @@ -49,7 +49,7 @@ public: } -void IfConstantConditionPass::run(QueryTreeNodePtr query_tree_node, ContextPtr) +void IfConstantConditionPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr) { IfConstantConditionVisitor visitor; visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/IfConstantConditionPass.h b/src/Analyzer/Passes/IfConstantConditionPass.h index 7817e67aa5e..7548fc702bc 100644 --- 
a/src/Analyzer/Passes/IfConstantConditionPass.h +++ b/src/Analyzer/Passes/IfConstantConditionPass.h @@ -21,7 +21,7 @@ public: String getDescription() override { return "Optimize if, multiIf for constant condition."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/IfTransformStringsToEnumPass.cpp b/src/Analyzer/Passes/IfTransformStringsToEnumPass.cpp index 562aff4cf05..32e3c3cda51 100644 --- a/src/Analyzer/Passes/IfTransformStringsToEnumPass.cpp +++ b/src/Analyzer/Passes/IfTransformStringsToEnumPass.cpp @@ -205,7 +205,7 @@ public: } -void IfTransformStringsToEnumPass::run(QueryTreeNodePtr query, ContextPtr context) +void IfTransformStringsToEnumPass::run(QueryTreeNodePtr & query, ContextPtr context) { ConvertStringsToEnumVisitor visitor(std::move(context)); visitor.visit(query); diff --git a/src/Analyzer/Passes/IfTransformStringsToEnumPass.h b/src/Analyzer/Passes/IfTransformStringsToEnumPass.h index a4a014967e0..522087aafae 100644 --- a/src/Analyzer/Passes/IfTransformStringsToEnumPass.h +++ b/src/Analyzer/Passes/IfTransformStringsToEnumPass.h @@ -33,7 +33,7 @@ public: String getDescription() override { return "Replaces string-type arguments in If and Transform to enum"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp index 13f8025f5ea..7e0b6b2f828 100644 --- a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp +++ b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.cpp @@ -233,7 +233,7 @@ private: } }; -void LogicalExpressionOptimizerPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void LogicalExpressionOptimizerPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { LogicalExpressionOptimizerVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.h b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.h index 05c10ddc685..51d9968b48c 100644 --- a/src/Analyzer/Passes/LogicalExpressionOptimizerPass.h +++ b/src/Analyzer/Passes/LogicalExpressionOptimizerPass.h @@ -76,7 +76,7 @@ public: String getDescription() override { return "Transform equality chain to a single IN function or a constant if possible"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/MultiIfToIfPass.cpp b/src/Analyzer/Passes/MultiIfToIfPass.cpp index 4672351bcfb..5012aa7fa78 100644 --- a/src/Analyzer/Passes/MultiIfToIfPass.cpp +++ b/src/Analyzer/Passes/MultiIfToIfPass.cpp @@ -43,7 +43,7 @@ private: } -void MultiIfToIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void MultiIfToIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { auto if_function_ptr = FunctionFactory::instance().get("if", context); MultiIfToIfVisitor visitor(std::move(if_function_ptr), std::move(context)); diff --git a/src/Analyzer/Passes/MultiIfToIfPass.h b/src/Analyzer/Passes/MultiIfToIfPass.h index 2213f3713ed..e3c03913aaa 100644 --- a/src/Analyzer/Passes/MultiIfToIfPass.h +++ b/src/Analyzer/Passes/MultiIfToIfPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Optimize multiIf 
with single condition to if."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp index d36be98751c..20b308c3af6 100644 --- a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp +++ b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp @@ -64,7 +64,7 @@ private: } -void NormalizeCountVariantsPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void NormalizeCountVariantsPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { NormalizeCountVariantsVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/NormalizeCountVariantsPass.h b/src/Analyzer/Passes/NormalizeCountVariantsPass.h index 78a114f4a85..6cf9f34619a 100644 --- a/src/Analyzer/Passes/NormalizeCountVariantsPass.h +++ b/src/Analyzer/Passes/NormalizeCountVariantsPass.h @@ -20,7 +20,7 @@ public: String getDescription() override { return "Optimize count(literal), sum(1) into count()."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp index 5ed52f1210b..7c851d5fc35 100644 --- a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp +++ b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.cpp @@ -130,7 +130,7 @@ private: } }; -void OptimizeGroupByFunctionKeysPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void OptimizeGroupByFunctionKeysPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { OptimizeGroupByFunctionKeysVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h index 632960c45bb..fd5eadcb796 100644 --- a/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h +++ b/src/Analyzer/Passes/OptimizeGroupByFunctionKeysPass.h @@ -16,7 +16,7 @@ public: String getDescription() override { return "Eliminates functions of other keys in GROUP BY section."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp index c6d312d0ecf..b6cc50caffe 100644 --- a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp +++ b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.cpp @@ -124,7 +124,7 @@ private: } -void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void OptimizeRedundantFunctionsInOrderByPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { OptimizeRedundantFunctionsInOrderByVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h index 609a6360d27..4a63c78022b 100644 --- a/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h +++ b/src/Analyzer/Passes/OptimizeRedundantFunctionsInOrderByPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "If ORDER BY has argument x 
followed by f(x) transforms it to ORDER BY x."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; } diff --git a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp index 3632c41028b..26ca5984b49 100644 --- a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp +++ b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.cpp @@ -70,7 +70,7 @@ private: } -void OrderByLimitByDuplicateEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr) +void OrderByLimitByDuplicateEliminationPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr) { OrderByLimitByDuplicateEliminationVisitor visitor; visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h index 11a025af5b9..de5e1898a4c 100644 --- a/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h +++ b/src/Analyzer/Passes/OrderByLimitByDuplicateEliminationPass.h @@ -20,7 +20,7 @@ public: String getDescription() override { return "Remove duplicate columns from ORDER BY, LIMIT BY."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/OrderByTupleEliminationPass.cpp b/src/Analyzer/Passes/OrderByTupleEliminationPass.cpp index f70ec27ba5d..7c106082124 100644 --- a/src/Analyzer/Passes/OrderByTupleEliminationPass.cpp +++ b/src/Analyzer/Passes/OrderByTupleEliminationPass.cpp @@ -50,7 +50,7 @@ public: } -void OrderByTupleEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr) +void OrderByTupleEliminationPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr) { OrderByTupleEliminationVisitor visitor; visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/OrderByTupleEliminationPass.h b/src/Analyzer/Passes/OrderByTupleEliminationPass.h index 5665561e227..45c8a756795 100644 --- a/src/Analyzer/Passes/OrderByTupleEliminationPass.h +++ b/src/Analyzer/Passes/OrderByTupleEliminationPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Remove tuple from ORDER BY."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index c454ad9f84f..1a76bc762a4 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -77,6 +77,8 @@ #include #include #include +#include +#include namespace ProfileEvents { @@ -1056,7 +1058,7 @@ private: class QueryAnalyzer { public: - void resolve(QueryTreeNodePtr node, const QueryTreeNodePtr & table_expression, ContextPtr context) + void resolve(QueryTreeNodePtr & node, const QueryTreeNodePtr & table_expression, ContextPtr context) { IdentifierResolveScope scope(node, nullptr /*parent_scope*/); @@ -1097,6 +1099,7 @@ public: { if (table_expression) { + LOG_DEBUG(&Poco::Logger::get("resolve"), "Table expression: {}", table_expression->dumpTree()); scope.expression_join_tree_node = table_expression; validateTableExpressionModifiers(scope.expression_join_tree_node, scope); initializeTableExpressionData(scope.expression_join_tree_node, scope); @@ -1106,6 +1109,7 @@ public: resolveExpressionNodeList(node, 
scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); else resolveExpressionNode(node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); + LOG_DEBUG(&Poco::Logger::get("resolve"), "Result: {}", node->dumpTree()); break; } @@ -2677,6 +2681,7 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier */ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromTableColumns(const IdentifierLookup & identifier_lookup, IdentifierResolveScope & scope) { + LOG_DEBUG(&Poco::Logger::get("tryResolveIdentifierFromTableColumns"), "{} {}", scope.column_name_to_column_node.size(), !identifier_lookup.isExpressionLookup()); if (scope.column_name_to_column_node.empty() || !identifier_lookup.isExpressionLookup()) return {}; @@ -2836,11 +2841,14 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromTableExpression(const Id QueryTreeNodePtr result_expression; bool match_full_identifier = false; + LOG_DEBUG(&Poco::Logger::get("resolve_identifier_from_storage_or_throw"), "Looking for id: {}", identifier_without_column_qualifier.getFullName()); + auto it = table_expression_data.column_name_to_column_node.find(identifier_without_column_qualifier.getFullName()); if (it != table_expression_data.column_name_to_column_node.end()) { match_full_identifier = true; result_expression = it->second; + LOG_DEBUG(&Poco::Logger::get("resolve_identifier_from_storage_or_throw"), "Found: {}", result_expression->dumpTree()); } else { @@ -5389,6 +5397,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id auto unresolved_identifier = identifier_node.getIdentifier(); auto resolve_identifier_expression_result = tryResolveIdentifier({unresolved_identifier, IdentifierLookupContext::EXPRESSION}, scope); auto resolved_identifier_node = resolve_identifier_expression_result.resolved_identifier; + LOG_DEBUG(&Poco::Logger::get("resolveExpressionNode"), "Resolved: {}", resolved_identifier_node ? resolved_identifier_node->dumpTree() : "Not resolved"); if (resolved_identifier_node && result_projection_names.empty() && (resolve_identifier_expression_result.isResolvedFromJoinTree() || resolve_identifier_expression_result.isResolvedFromExpressionArguments())) @@ -5470,6 +5479,7 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id } node = std::move(resolved_identifier_node); + LOG_DEBUG(&Poco::Logger::get("resolveExpressionNode"), "Result node: {}", node ? 
node->dumpTree() : "Not resolved"); if (node->getNodeType() == QueryTreeNodeType::LIST) { @@ -6173,6 +6183,7 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table table_expression_data.should_qualify_columns = false; } + LOG_DEBUG(&Poco::Logger::get("Analyzer"), "Table data: {}", table_expression_data.dump()); scope.table_expression_node_to_data.emplace(table_expression_node, std::move(table_expression_data)); } @@ -7152,7 +7163,7 @@ QueryAnalysisPass::QueryAnalysisPass(QueryTreeNodePtr table_expression_) : table_expression(std::move(table_expression_)) {} -void QueryAnalysisPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void QueryAnalysisPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { QueryAnalyzer analyzer; analyzer.resolve(query_tree_node, table_expression, context); diff --git a/src/Analyzer/Passes/QueryAnalysisPass.h b/src/Analyzer/Passes/QueryAnalysisPass.h index fa8778ebf76..5d335d3e712 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.h +++ b/src/Analyzer/Passes/QueryAnalysisPass.h @@ -89,7 +89,7 @@ public: return "Resolve type for each query expression. Replace identifiers, matchers with query expressions. Perform constant folding. Evaluate scalar subqueries."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; private: QueryTreeNodePtr table_expression; diff --git a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp index de264948d4c..2fe5a89578b 100644 --- a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp +++ b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.cpp @@ -108,7 +108,7 @@ private: } -void RewriteAggregateFunctionWithIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void RewriteAggregateFunctionWithIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { RewriteAggregateFunctionWithIfVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.h b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.h index be8ad3ac34d..0a2fc1ba423 100644 --- a/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.h +++ b/src/Analyzer/Passes/RewriteAggregateFunctionWithIfPass.h @@ -20,7 +20,7 @@ public: return "Rewrite aggregate functions with if expression as argument when logically equivalent"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/ShardNumColumnToFunctionPass.cpp b/src/Analyzer/Passes/ShardNumColumnToFunctionPass.cpp index b28816e8ff3..c273aecc9b5 100644 --- a/src/Analyzer/Passes/ShardNumColumnToFunctionPass.cpp +++ b/src/Analyzer/Passes/ShardNumColumnToFunctionPass.cpp @@ -58,7 +58,7 @@ public: } -void ShardNumColumnToFunctionPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void ShardNumColumnToFunctionPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { ShardNumColumnToFunctionVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/ShardNumColumnToFunctionPass.h b/src/Analyzer/Passes/ShardNumColumnToFunctionPass.h index 71a038bcf39..248f4e29bbe 100644 --- a/src/Analyzer/Passes/ShardNumColumnToFunctionPass.h +++ b/src/Analyzer/Passes/ShardNumColumnToFunctionPass.h @@ -17,7 +17,7 @@ public: String getDescription() 
override { return "Rewrite _shard_num column into shardNum() function"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/SumIfToCountIfPass.cpp b/src/Analyzer/Passes/SumIfToCountIfPass.cpp index d55af278152..04d6c134d10 100644 --- a/src/Analyzer/Passes/SumIfToCountIfPass.cpp +++ b/src/Analyzer/Passes/SumIfToCountIfPass.cpp @@ -180,7 +180,7 @@ private: } -void SumIfToCountIfPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void SumIfToCountIfPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { SumIfToCountIfVisitor visitor(context); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/SumIfToCountIfPass.h b/src/Analyzer/Passes/SumIfToCountIfPass.h index f3ba47f1c2c..439d80c6306 100644 --- a/src/Analyzer/Passes/SumIfToCountIfPass.h +++ b/src/Analyzer/Passes/SumIfToCountIfPass.h @@ -23,7 +23,7 @@ public: String getDescription() override { return "Rewrite sum(if) and sumIf into countIf"; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp index 5c4484457e8..e256934010d 100644 --- a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp +++ b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.cpp @@ -87,7 +87,7 @@ public: } -void UniqInjectiveFunctionsEliminationPass::run(QueryTreeNodePtr query_tree_node, ContextPtr context) +void UniqInjectiveFunctionsEliminationPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { UniqInjectiveFunctionsEliminationVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.h b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.h index a0f07dfb7b5..c143fe2c39c 100644 --- a/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.h +++ b/src/Analyzer/Passes/UniqInjectiveFunctionsEliminationPass.h @@ -17,7 +17,7 @@ public: String getDescription() override { return "Remove injective functions from uniq functions arguments."; } - void run(QueryTreeNodePtr query_tree_node, ContextPtr context) override; + void run(QueryTreeNodePtr & query_tree_node, ContextPtr context) override; }; diff --git a/src/Planner/PlannerActionsVisitor.cpp b/src/Planner/PlannerActionsVisitor.cpp index c64d82299ca..e9fa72f925d 100644 --- a/src/Planner/PlannerActionsVisitor.cpp +++ b/src/Planner/PlannerActionsVisitor.cpp @@ -494,8 +494,8 @@ PlannerActionsVisitorImpl::NodeNameAndNodeMinLevel PlannerActionsVisitorImpl::vi return visitFunction(node); throw Exception(ErrorCodes::UNSUPPORTED_METHOD, - "Expected column, constant, function. Actual {}", - node->formatASTForErrorMessage()); + "Expected column, constant, function. 
Actual {} with type: {}", + node->formatASTForErrorMessage(), node_type); } PlannerActionsVisitorImpl::NodeNameAndNodeMinLevel PlannerActionsVisitorImpl::visitColumn(const QueryTreeNodePtr & node) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index b91ad0b963a..9f9f0fda9e2 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -30,6 +30,7 @@ #include #include #include +#include "Analyzer/IQueryTreeNode.h" #include #include @@ -937,7 +938,8 @@ QueryTreeNodePtr buildQueryTreeDistributed(SelectQueryInfo & query_info, table_function_node->setTableExpressionModifiers(*table_expression_modifiers); QueryAnalysisPass query_analysis_pass; - query_analysis_pass.run(table_function_node, query_context); + QueryTreeNodePtr node = table_function_node; + query_analysis_pass.run(node, query_context); replacement_table_expression = std::move(table_function_node); } diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index b0ed242d14d..a49155ac2d9 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -27,9 +27,18 @@ #include #include #include +#include "Common/logger_useful.h" #include #include +#include "Analyzer/ColumnNode.h" +#include "Analyzer/IQueryTreeNode.h" +#include "Analyzer/Identifier.h" +#include "Analyzer/IdentifierNode.h" +#include "Analyzer/Passes/QueryAnalysisPass.h" +#include "Analyzer/QueryTreeBuilder.h" +#include "Core/NamesAndTypes.h" #include "DataTypes/IDataType.h" +#include "Planner/PlannerActionsVisitor.h" #include #include #include @@ -42,6 +51,7 @@ #include #include #include +#include namespace @@ -464,8 +474,8 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu auto storage_metadata_snapshot = storage->getInMemoryMetadataPtr(); auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, context); - auto modified_query_info = getModifiedQueryInfo(query_info, context, table, nested_storage_snaphsot); Names column_names_as_aliases; + auto modified_query_info = getModifiedQueryInfo(context, table, nested_storage_snaphsot, column_names_as_aliases); if (!context->getSettingsRef().allow_experimental_analyzer) { @@ -553,10 +563,10 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu pipeline.addResources(std::move(resources)); } -SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & query_info, - const ContextPtr & modified_context, +SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, - const StorageSnapshotPtr & storage_snapshot) + const StorageSnapshotPtr & storage_snapshot, + Names & column_names_as_aliases) const { const auto & [database_name, storage, storage_lock, table_name] = storage_with_lock_and_name; const StorageID current_storage_id = storage->getStorageID(); @@ -586,6 +596,47 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & quer if (!storage_snapshot->tryGetColumn(get_column_options, "_database")) column_name_to_node.emplace("_database", std::make_shared(current_storage_id.database_name)); + auto storage_columns = storage_snapshot->metadata->getColumns(); + + bool with_aliases = /* common_processed_stage == QueryProcessingStage::FetchColumns && */ !storage_columns.getAliases().empty(); + if (with_aliases) + { + auto filter_actions_dag = std::make_shared(); + for (const auto & column : column_names) + { + const 
auto column_default = storage_columns.getDefault(column); + bool is_alias = column_default && column_default->kind == ColumnDefaultKind::Alias; + + QueryTreeNodePtr column_node; + + if (is_alias) + { + column_node = buildQueryTree(column_default->expression, modified_context); + + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", column_node->dumpTree(), modified_query_info.table_expression->dumpTree()); + + column_node->setAlias(column); + + QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); + query_analysis_pass.run(column_node, modified_context); + + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT after: {}", column_node->dumpTree()); + + column_name_to_node.emplace(column, column_node); + } + else + { + column_node = std::make_shared(NameAndTypePair{column, storage_columns.getColumn(get_column_options, column).type }, modified_query_info.table_expression); + } + + + PlannerActionsVisitor actions_visitor(modified_query_info.planner_context, false /*use_column_identifier_as_action_node_name*/); + actions_visitor.visit(filter_actions_dag, column_node); + } + column_names_as_aliases = filter_actions_dag->getRequiredColumnsNames(); + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Required names: {}", toString(column_names_as_aliases)); + } + if (!column_name_to_node.empty()) { replaceColumns(modified_query_info.query_tree, @@ -594,6 +645,7 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const SelectQueryInfo & quer } modified_query_info.query = queryNodeToSelectQuery(modified_query_info.query_tree); + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Modified query: {}", modified_query_info.query->formatForLogging()); } else { @@ -640,6 +692,8 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( modified_select.setFinal(); } + LOG_DEBUG(&Poco::Logger::get("createSources"), "real_column_names: {}", toString(real_column_names)); + bool allow_experimental_analyzer = modified_context->getSettingsRef().allow_experimental_analyzer; auto storage_stage = storage->getQueryProcessingStage(modified_context, @@ -783,7 +837,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( /// Subordinary tables could have different but convertible types, like numeric types of different width. /// We must return streams with structure equals to structure of Merge table. 
- convertingSourceStream(header, storage_snapshot->metadata, aliases, modified_context, *builder, processed_stage); + convertingSourceStream(header, modified_query_info, storage_snapshot->metadata, aliases, modified_context, *builder, processed_stage); } return builder; @@ -957,9 +1011,10 @@ void StorageMerge::alter( void ReadFromMerge::convertingSourceStream( const Block & header, + SelectQueryInfo & modified_query_info, const StorageMetadataPtr & metadata_snapshot, const Aliases & aliases, - ContextPtr local_context, + ContextMutablePtr local_context, QueryPipelineBuilder & builder, const QueryProcessingStage::Enum & processed_stage) { @@ -968,21 +1023,49 @@ void ReadFromMerge::convertingSourceStream( auto storage_sample_block = metadata_snapshot->getSampleBlock(); auto pipe_columns = builder.getHeader().getNamesAndTypesList(); - for (const auto & alias : aliases) + if (local_context->getSettingsRef().allow_experimental_analyzer) { - pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); - ASTPtr expr = alias.expression; - auto syntax_result = TreeRewriter(local_context).analyze(expr, pipe_columns); - auto expression_analyzer = ExpressionAnalyzer{alias.expression, syntax_result, local_context}; - - auto dag = std::make_shared(pipe_columns); - auto actions_dag = expression_analyzer.getActionsDAG(true, false); - auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - - builder.addSimpleTransform([&](const Block & stream_header) + for (const auto & alias : aliases) { - return std::make_shared(stream_header, actions); - }); + pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); + + auto actions_dag = std::make_shared(); + + QueryTreeNodePtr query_tree = buildQueryTree(alias.expression, local_context); + query_tree->setAlias(alias.name); + + QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); + query_analysis_pass.run(query_tree, local_context); + + PlannerActionsVisitor actions_visitor(modified_query_info.planner_context, false /*use_column_identifier_as_action_node_name*/); + actions_visitor.visit(actions_dag, query_tree); + + auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); + + builder.addSimpleTransform([&](const Block & stream_header) + { + return std::make_shared(stream_header, actions); + }); + } + } + else + { + for (const auto & alias : aliases) + { + pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); + ASTPtr expr = alias.expression; + auto syntax_result = TreeRewriter(local_context).analyze(expr, pipe_columns); + auto expression_analyzer = ExpressionAnalyzer{alias.expression, syntax_result, local_context}; + + auto dag = std::make_shared(pipe_columns); + auto actions_dag = expression_analyzer.getActionsDAG(true, false); + auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); + + builder.addSimpleTransform([&](const Block & stream_header) + { + return std::make_shared(stream_header, actions); + }); + } } ActionsDAG::MatchColumnsMode convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Name; diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index babf0dd92e8..739d6831f6f 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -177,10 +177,10 @@ private: using Aliases = std::vector; - static SelectQueryInfo getModifiedQueryInfo(const 
SelectQueryInfo & query_info, - const ContextPtr & modified_context, + SelectQueryInfo getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, - const StorageSnapshotPtr & storage_snapshot); + const StorageSnapshotPtr & storage_snapshot, + Names & column_names_as_aliases) const; QueryPipelineBuilderPtr createSources( const StorageSnapshotPtr & storage_snapshot, @@ -197,9 +197,10 @@ private: static void convertingSourceStream( const Block & header, + SelectQueryInfo & modified_query_info, const StorageMetadataPtr & metadata_snapshot, const Aliases & aliases, - ContextPtr context, + ContextMutablePtr context, QueryPipelineBuilder & builder, const QueryProcessingStage::Enum & processed_stage); }; From fc9ee3eb4e1e4c4b145bc39bc7ce507cf05b9d1d Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 13 Jun 2023 15:01:31 +0000 Subject: [PATCH 002/884] Correctly build the ActionsDAG --- src/Storages/StorageMerge.cpp | 28 +++++++++++++++++++++------- src/Storages/StorageMerge.h | 3 ++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index a49155ac2d9..d036eaa9f25 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -475,7 +475,7 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu auto nested_storage_snaphsot = storage->getStorageSnapshot(storage_metadata_snapshot, context); Names column_names_as_aliases; - auto modified_query_info = getModifiedQueryInfo(context, table, nested_storage_snaphsot, column_names_as_aliases); + auto modified_query_info = getModifiedQueryInfo(context, table, nested_storage_snaphsot, column_names_as_aliases, aliases); if (!context->getSettingsRef().allow_experimental_analyzer) { @@ -566,7 +566,8 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, const StorageSnapshotPtr & storage_snapshot, - Names & column_names_as_aliases) const + Names & column_names_as_aliases, + Aliases & aliases) const { const auto & [database_name, storage, storage_lock, table_name] = storage_with_lock_and_name; const StorageID current_storage_id = storage->getStorageID(); @@ -611,18 +612,23 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ if (is_alias) { - column_node = buildQueryTree(column_default->expression, modified_context); + // column_node = buildQueryTree(column_default->expression, modified_context); + column_node = std::make_shared(Identifier{column}); LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", column_node->dumpTree(), modified_query_info.table_expression->dumpTree()); - column_node->setAlias(column); - QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); query_analysis_pass.run(column_node, modified_context); LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT after: {}", column_node->dumpTree()); + auto * resolved_column = column_node->as(); + if (!resolved_column || !resolved_column->getExpression()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Alias column is not resolved"); + + column_node = resolved_column->getExpression(); column_name_to_node.emplace(column, column_node); + aliases.push_back({ .name = column, .type = resolved_column->getResultType(), .expression = column_node->toAST() }); } else { @@ -634,6 +640,9 
@@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ actions_visitor.visit(filter_actions_dag, column_node); } column_names_as_aliases = filter_actions_dag->getRequiredColumnsNames(); + if (column_names_as_aliases.empty()) + column_names_as_aliases.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name); + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Required names: {}", toString(column_names_as_aliases)); } @@ -1029,7 +1038,7 @@ void ReadFromMerge::convertingSourceStream( { pipe_columns.emplace_back(NameAndTypePair(alias.name, alias.type)); - auto actions_dag = std::make_shared(); + auto actions_dag = std::make_shared(pipe_columns); QueryTreeNodePtr query_tree = buildQueryTree(alias.expression, local_context); query_tree->setAlias(alias.name); @@ -1038,7 +1047,12 @@ void ReadFromMerge::convertingSourceStream( query_analysis_pass.run(query_tree, local_context); PlannerActionsVisitor actions_visitor(modified_query_info.planner_context, false /*use_column_identifier_as_action_node_name*/); - actions_visitor.visit(actions_dag, query_tree); + const auto & nodes = actions_visitor.visit(actions_dag, query_tree); + + if (nodes.size() != 1) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected to have 1 output but got {}", nodes.size()); + + actions_dag->addOrReplaceInOutputs(actions_dag->addAlias(*nodes.front(), alias.name)); auto actions = std::make_shared(actions_dag, ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 739d6831f6f..987869e5de3 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -180,7 +180,8 @@ private: SelectQueryInfo getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, const StorageSnapshotPtr & storage_snapshot, - Names & column_names_as_aliases) const; + Names & column_names_as_aliases, + Aliases & aliases) const; QueryPipelineBuilderPtr createSources( const StorageSnapshotPtr & storage_snapshot, From 55b81a5a5e7ad73a3e53aee0d0b83731ff8e76ed Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 13 Jun 2023 23:13:18 +0000 Subject: [PATCH 003/884] Fix style --- src/Storages/StorageMerge.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index d036eaa9f25..e2a27d4e20e 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -80,6 +80,7 @@ namespace DB namespace ErrorCodes { + extern const int LOGICAL_ERROR; extern const int BAD_ARGUMENTS; extern const int NOT_IMPLEMENTED; extern const int ILLEGAL_PREWHERE; From 6489922dc19a0fda86bdcc8e08c108812dc4aebf Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Fri, 16 Jun 2023 18:49:59 +0000 Subject: [PATCH 004/884] Fix for column aliases that use other aliases --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 9 ------ src/Storages/StorageMerge.cpp | 38 ++++++++++++++++++++--- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 1a76bc762a4..309f067c4c0 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -1099,7 +1099,6 @@ public: { if (table_expression) { - LOG_DEBUG(&Poco::Logger::get("resolve"), "Table expression: {}", table_expression->dumpTree()); scope.expression_join_tree_node = table_expression; 
validateTableExpressionModifiers(scope.expression_join_tree_node, scope); initializeTableExpressionData(scope.expression_join_tree_node, scope); @@ -1109,7 +1108,6 @@ public: resolveExpressionNodeList(node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); else resolveExpressionNode(node, scope, false /*allow_lambda_expression*/, false /*allow_table_expression*/); - LOG_DEBUG(&Poco::Logger::get("resolve"), "Result: {}", node->dumpTree()); break; } @@ -2681,7 +2679,6 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromAliases(const Identifier */ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromTableColumns(const IdentifierLookup & identifier_lookup, IdentifierResolveScope & scope) { - LOG_DEBUG(&Poco::Logger::get("tryResolveIdentifierFromTableColumns"), "{} {}", scope.column_name_to_column_node.size(), !identifier_lookup.isExpressionLookup()); if (scope.column_name_to_column_node.empty() || !identifier_lookup.isExpressionLookup()) return {}; @@ -2841,14 +2838,11 @@ QueryTreeNodePtr QueryAnalyzer::tryResolveIdentifierFromTableExpression(const Id QueryTreeNodePtr result_expression; bool match_full_identifier = false; - LOG_DEBUG(&Poco::Logger::get("resolve_identifier_from_storage_or_throw"), "Looking for id: {}", identifier_without_column_qualifier.getFullName()); - auto it = table_expression_data.column_name_to_column_node.find(identifier_without_column_qualifier.getFullName()); if (it != table_expression_data.column_name_to_column_node.end()) { match_full_identifier = true; result_expression = it->second; - LOG_DEBUG(&Poco::Logger::get("resolve_identifier_from_storage_or_throw"), "Found: {}", result_expression->dumpTree()); } else { @@ -5397,7 +5391,6 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id auto unresolved_identifier = identifier_node.getIdentifier(); auto resolve_identifier_expression_result = tryResolveIdentifier({unresolved_identifier, IdentifierLookupContext::EXPRESSION}, scope); auto resolved_identifier_node = resolve_identifier_expression_result.resolved_identifier; - LOG_DEBUG(&Poco::Logger::get("resolveExpressionNode"), "Resolved: {}", resolved_identifier_node ? resolved_identifier_node->dumpTree() : "Not resolved"); if (resolved_identifier_node && result_projection_names.empty() && (resolve_identifier_expression_result.isResolvedFromJoinTree() || resolve_identifier_expression_result.isResolvedFromExpressionArguments())) @@ -5479,7 +5472,6 @@ ProjectionNames QueryAnalyzer::resolveExpressionNode(QueryTreeNodePtr & node, Id } node = std::move(resolved_identifier_node); - LOG_DEBUG(&Poco::Logger::get("resolveExpressionNode"), "Result node: {}", node ? 
node->dumpTree() : "Not resolved"); if (node->getNodeType() == QueryTreeNodeType::LIST) { @@ -6183,7 +6175,6 @@ void QueryAnalyzer::initializeTableExpressionData(const QueryTreeNodePtr & table table_expression_data.should_qualify_columns = false; } - LOG_DEBUG(&Poco::Logger::get("Analyzer"), "Table data: {}", table_expression_data.dump()); scope.table_expression_node_to_data.emplace(table_expression_node, std::move(table_expression_data)); } diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index e2a27d4e20e..13548a84826 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -34,6 +34,7 @@ #include "Analyzer/IQueryTreeNode.h" #include "Analyzer/Identifier.h" #include "Analyzer/IdentifierNode.h" +#include "Analyzer/InDepthQueryTreeVisitor.h" #include "Analyzer/Passes/QueryAnalysisPass.h" #include "Analyzer/QueryTreeBuilder.h" #include "Core/NamesAndTypes.h" @@ -564,6 +565,26 @@ void ReadFromMerge::initializePipeline(QueryPipelineBuilder & pipeline, const Bu pipeline.addResources(std::move(resources)); } +namespace +{ + +class ApplyAliasColumnExpressionsVisitor : public InDepthQueryTreeVisitor +{ +public: + ApplyAliasColumnExpressionsVisitor() = default; + + void visitImpl(QueryTreeNodePtr & node) + { + if (auto * column = node->as(); + column != nullptr && column->hasExpression()) + { + node = column->getExpressionOrThrow(); + } + } +}; + +} + SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_context, const StorageWithLockAndName & storage_with_lock_and_name, const StorageSnapshotPtr & storage_snapshot, @@ -611,23 +632,28 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ QueryTreeNodePtr column_node; + if (is_alias) { // column_node = buildQueryTree(column_default->expression, modified_context); - column_node = std::make_shared(Identifier{column}); + QueryTreeNodePtr fake_node = std::make_shared(Identifier{column}); - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", column_node->dumpTree(), modified_query_info.table_expression->dumpTree()); + LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", fake_node->dumpTree(), modified_query_info.table_expression->dumpTree()); QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); - query_analysis_pass.run(column_node, modified_context); + query_analysis_pass.run(fake_node, modified_context); + + auto * resolved_column = fake_node->as(); + + column_node = fake_node; + ApplyAliasColumnExpressionsVisitor visitor; + visitor.visit(column_node); LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT after: {}", column_node->dumpTree()); - auto * resolved_column = column_node->as(); if (!resolved_column || !resolved_column->getExpression()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Alias column is not resolved"); - column_node = resolved_column->getExpression(); column_name_to_node.emplace(column, column_node); aliases.push_back({ .name = column, .type = resolved_column->getResultType(), .expression = column_node->toAST() }); } @@ -1095,6 +1121,8 @@ void ReadFromMerge::convertingSourceStream( std::move(convert_actions_dag), ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); + LOG_DEBUG(&Poco::Logger::get("convertingSourceStream"), "The header: {}", builder.getHeader().dumpStructure()); + builder.addSimpleTransform([&](const Block & stream_header) { return std::make_shared(stream_header, actions); From 
f9e67fe0427ee2d698d2b946a8286e228d47b0ec Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 19 Jun 2023 15:10:29 +0000 Subject: [PATCH 005/884] Update broken_tests.txt --- tests/broken_tests.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index d49b4f391e5..1635c8740cc 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -24,7 +24,6 @@ 01173_transaction_control_queries 01211_optimize_skip_unused_shards_type_mismatch 01213_optimize_skip_unused_shards_DISTINCT -01214_test_storage_merge_aliases_with_where 01231_distributed_aggregation_memory_efficient_mix_levels 01244_optimize_distributed_group_by_sharding_key 01247_optimize_distributed_group_by_sharding_key_dist_on_dist @@ -68,7 +67,6 @@ 01890_materialized_distributed_join 01901_in_literal_shard_prune 01925_join_materialized_columns -01925_test_storage_merge_aliases 01930_optimize_skip_unused_shards_rewrite_in 01947_mv_subquery 01951_distributed_push_down_limit From dcdadd5f639def096bd330f987609d0c5740ca83 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 19 Jun 2023 15:18:04 +0000 Subject: [PATCH 006/884] Update broken_tests.txt --- tests/broken_tests.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index 1635c8740cc..8b11c5f5413 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -99,7 +99,6 @@ 02494_optimize_group_by_function_keys_and_alias_columns 02521_aggregation_by_partitions 02554_fix_grouping_sets_predicate_push_down -02575_merge_prewhere_different_default_kind 02713_array_low_cardinality_string 02707_skip_index_with_in 02241_join_rocksdb_bs From 20c752fb787a05f9180f791401afe56bf372acfc Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Mon, 19 Jun 2023 15:44:01 +0000 Subject: [PATCH 007/884] Fix generated query --- src/Storages/StorageMerge.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 13548a84826..22308c1d901 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -614,7 +614,11 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ std::unordered_map column_name_to_node; if (!storage_snapshot->tryGetColumn(get_column_options, "_table")) - column_name_to_node.emplace("_table", std::make_shared(current_storage_id.table_name)); + { + auto table_name_node = std::make_shared(current_storage_id.table_name); + table_name_node->setAlias("_table"); + column_name_to_node.emplace("_table", table_name_node); + } if (!storage_snapshot->tryGetColumn(get_column_options, "_database")) column_name_to_node.emplace("_database", std::make_shared(current_storage_id.database_name)); From 118b84703bb0f08aa622b956b1207d9092f5f2d7 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 21 Jun 2023 01:51:34 +0200 Subject: [PATCH 008/884] WIP on StorageMerge and distributed JOIN --- src/Analyzer/ColumnNode.h | 5 ++ src/Storages/StorageMerge.cpp | 86 ++++++++++++++++++++++++++++++++--- src/Storages/StorageMerge.h | 2 +- 3 files changed, 86 insertions(+), 7 deletions(-) diff --git a/src/Analyzer/ColumnNode.h b/src/Analyzer/ColumnNode.h index b320df788c5..46e7c8eb500 100644 --- a/src/Analyzer/ColumnNode.h +++ b/src/Analyzer/ColumnNode.h @@ -108,6 +108,11 @@ public: */ QueryTreeNodePtr getColumnSourceOrNull() const; + void setColumnSource(const QueryTreeNodePtr & source) + { + getSourceWeakPointer() = source; + } + QueryTreeNodeType getNodeType() const override { 
return QueryTreeNodeType::COLUMN; diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 22308c1d901..85ec21b4765 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -1,5 +1,6 @@ #include #include +#include #include #include #include @@ -51,6 +52,7 @@ #include #include #include +#include #include #include @@ -583,6 +585,76 @@ public: } }; +bool hasUnknownColumn(const QueryTreeNodePtr & node, + QueryTreeNodePtr original_table_expression, + QueryTreeNodePtr replacement_table_expression) +{ + QueryTreeNodes stack = { node }; + while (!stack.empty()) + { + auto current = stack.back(); + stack.pop_back(); + + switch (current->getNodeType()) + { + case QueryTreeNodeType::CONSTANT: + break; + case QueryTreeNodeType::COLUMN: + { + auto * column_node = current->as(); + auto source = column_node->getColumnSourceOrNull(); + if (source != original_table_expression) + return true; + else + column_node->setColumnSource(replacement_table_expression); + break; + } + default: + { + for (const auto & child : node->getChildren()) + { + if (child) + stack.push_back(child); + } + } + } + } + return false; +} + +QueryTreeNodePtr removeJoin( + QueryTreeNodePtr query, + QueryTreeNodePtr original_table_expression, + QueryTreeNodePtr replacement_table_expression) +{ + auto * query_node = query->as(); + auto modified_query = query_node->cloneAndReplace(query_node->getJoinTree(), replacement_table_expression); + + query_node = modified_query->as(); + query_node->getGroupBy().getNodes().clear(); + query_node->getHaving() = {}; + query_node->getOrderBy().getNodes().clear(); + + auto & projection = query_node->getProjection().getNodes(); + auto projection_columns = query_node->getProjectionColumns(); + for (size_t i = 0; i < projection.size();) + { + if (hasUnknownColumn(projection[i], original_table_expression, replacement_table_expression)) + { + projection.erase(projection.begin() + i); + projection_columns.erase(projection_columns.begin() + i); + continue; + } + ++i; + } + + query_node->resolveProjectionColumns(std::move(projection_columns)); + + LOG_DEBUG(&Poco::Logger::get("removeJoin"), "Query without JOIN:\n{}", modified_query->dumpTree()); + + return modified_query; +} + } SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_context, @@ -602,8 +674,9 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ if (query_info.table_expression_modifiers) replacement_table_expression->setTableExpressionModifiers(*query_info.table_expression_modifiers); - modified_query_info.query_tree = modified_query_info.query_tree->cloneAndReplace(modified_query_info.table_expression, - replacement_table_expression); + modified_query_info.query_tree = removeJoin(modified_query_info.query_tree, modified_query_info.table_expression, replacement_table_expression); + // modified_query_info.query_tree = modified_query_info.query_tree->cloneAndReplace(modified_query_info.table_expression, + // replacement_table_expression); modified_query_info.table_expression = replacement_table_expression; modified_query_info.planner_context->getOrCreateTableExpressionData(replacement_table_expression); @@ -877,7 +950,7 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( /// Subordinary tables could have different but convertible types, like numeric types of different width. /// We must return streams with structure equals to structure of Merge table. 
- convertingSourceStream(header, modified_query_info, storage_snapshot->metadata, aliases, modified_context, *builder, processed_stage); + convertingSourceStream(header, modified_query_info, storage_snapshot, aliases, modified_context, *builder, processed_stage); } return builder; @@ -1052,7 +1125,7 @@ void StorageMerge::alter( void ReadFromMerge::convertingSourceStream( const Block & header, SelectQueryInfo & modified_query_info, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & snapshot, const Aliases & aliases, ContextMutablePtr local_context, QueryPipelineBuilder & builder, @@ -1060,7 +1133,7 @@ void ReadFromMerge::convertingSourceStream( { Block before_block_header = builder.getHeader(); - auto storage_sample_block = metadata_snapshot->getSampleBlock(); + auto storage_sample_block = snapshot->metadata->getSampleBlock(); auto pipe_columns = builder.getHeader().getNamesAndTypesList(); if (local_context->getSettingsRef().allow_experimental_analyzer) @@ -1115,7 +1188,8 @@ void ReadFromMerge::convertingSourceStream( ActionsDAG::MatchColumnsMode convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Name; - if (local_context->getSettingsRef().allow_experimental_analyzer && processed_stage != QueryProcessingStage::FetchColumns) + if (local_context->getSettingsRef().allow_experimental_analyzer + && (processed_stage != QueryProcessingStage::FetchColumns || dynamic_cast(&snapshot->storage) != nullptr)) convert_actions_match_columns_mode = ActionsDAG::MatchColumnsMode::Position; auto convert_actions_dag = ActionsDAG::makeConvertingActions(builder.getHeader().getColumnsWithTypeAndName(), diff --git a/src/Storages/StorageMerge.h b/src/Storages/StorageMerge.h index 987869e5de3..de9480292f9 100644 --- a/src/Storages/StorageMerge.h +++ b/src/Storages/StorageMerge.h @@ -199,7 +199,7 @@ private: static void convertingSourceStream( const Block & header, SelectQueryInfo & modified_query_info, - const StorageMetadataPtr & metadata_snapshot, + const StorageSnapshotPtr & snapshot, const Aliases & aliases, ContextMutablePtr context, QueryPipelineBuilder & builder, From 88fe30254a280286ac2bd2b6bcdc71865ec2aed2 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 21 Jun 2023 17:55:14 +0000 Subject: [PATCH 009/884] Small fixup --- src/Storages/StorageMerge.cpp | 12 +++++++++--- tests/broken_tests.txt | 1 - 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 85ec21b4765..d1ac3f57ae1 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -631,6 +631,10 @@ QueryTreeNodePtr removeJoin( auto modified_query = query_node->cloneAndReplace(query_node->getJoinTree(), replacement_table_expression); query_node = modified_query->as(); + + //TODO: change the predicates to make it valid and execute it on shards. 
+ query_node->getPrewhere() = {}; + query_node->getWhere() = {}; query_node->getGroupBy().getNodes().clear(); query_node->getHaving() = {}; query_node->getOrderBy().getNodes().clear(); @@ -675,8 +679,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ replacement_table_expression->setTableExpressionModifiers(*query_info.table_expression_modifiers); modified_query_info.query_tree = removeJoin(modified_query_info.query_tree, modified_query_info.table_expression, replacement_table_expression); - // modified_query_info.query_tree = modified_query_info.query_tree->cloneAndReplace(modified_query_info.table_expression, - // replacement_table_expression); modified_query_info.table_expression = replacement_table_expression; modified_query_info.planner_context->getOrCreateTableExpressionData(replacement_table_expression); @@ -694,7 +696,11 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ } if (!storage_snapshot->tryGetColumn(get_column_options, "_database")) - column_name_to_node.emplace("_database", std::make_shared(current_storage_id.database_name)); + { + auto database_name_node = std::make_shared(current_storage_id.database_name); + database_name_node->setAlias("_database"); + column_name_to_node.emplace("_database", database_name_node); + } auto storage_columns = storage_snapshot->metadata->getColumns(); diff --git a/tests/broken_tests.txt b/tests/broken_tests.txt index e6b5fb4f631..f6e21a29eed 100644 --- a/tests/broken_tests.txt +++ b/tests/broken_tests.txt @@ -38,7 +38,6 @@ 01527_dist_sharding_key_dictGet_reload 01528_allow_nondeterministic_optimize_skip_unused_shards 01540_verbatim_partition_pruning -01560_merge_distributed_join 01563_distributed_query_finish 01576_alias_column_rewrite 01583_const_column_in_set_index From 47fafdc32c320464bbd65468208bbc8e5b7ac62f Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Wed, 21 Jun 2023 18:06:24 +0000 Subject: [PATCH 010/884] Code cleanup --- src/Storages/StorageDistributed.cpp | 1 - src/Storages/StorageMerge.cpp | 35 ++++++++--------------------- 2 files changed, 9 insertions(+), 27 deletions(-) diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index 9f9f0fda9e2..b948ca946c3 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -30,7 +30,6 @@ #include #include #include -#include "Analyzer/IQueryTreeNode.h" #include #include diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index d1ac3f57ae1..1a0376edbf5 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -28,19 +28,17 @@ #include #include #include -#include "Common/logger_useful.h" #include #include -#include "Analyzer/ColumnNode.h" -#include "Analyzer/IQueryTreeNode.h" -#include "Analyzer/Identifier.h" -#include "Analyzer/IdentifierNode.h" -#include "Analyzer/InDepthQueryTreeVisitor.h" -#include "Analyzer/Passes/QueryAnalysisPass.h" -#include "Analyzer/QueryTreeBuilder.h" -#include "Core/NamesAndTypes.h" -#include "DataTypes/IDataType.h" -#include "Planner/PlannerActionsVisitor.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include #include #include #include @@ -52,7 +50,6 @@ #include #include #include -#include #include #include @@ -654,8 +651,6 @@ QueryTreeNodePtr removeJoin( query_node->resolveProjectionColumns(std::move(projection_columns)); - LOG_DEBUG(&Poco::Logger::get("removeJoin"), "Query without JOIN:\n{}", modified_query->dumpTree()); - return 
modified_query; } @@ -718,11 +713,8 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ if (is_alias) { - // column_node = buildQueryTree(column_default->expression, modified_context); QueryTreeNodePtr fake_node = std::make_shared(Identifier{column}); - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT before: {}\n{}", fake_node->dumpTree(), modified_query_info.table_expression->dumpTree()); - QueryAnalysisPass query_analysis_pass(modified_query_info.table_expression); query_analysis_pass.run(fake_node, modified_context); @@ -732,8 +724,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ ApplyAliasColumnExpressionsVisitor visitor; visitor.visit(column_node); - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "QT after: {}", column_node->dumpTree()); - if (!resolved_column || !resolved_column->getExpression()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Alias column is not resolved"); @@ -752,8 +742,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ column_names_as_aliases = filter_actions_dag->getRequiredColumnsNames(); if (column_names_as_aliases.empty()) column_names_as_aliases.push_back(ExpressionActions::getSmallestColumn(storage_snapshot->metadata->getColumns().getAllPhysical()).name); - - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Required names: {}", toString(column_names_as_aliases)); } if (!column_name_to_node.empty()) @@ -764,7 +752,6 @@ SelectQueryInfo ReadFromMerge::getModifiedQueryInfo(const ContextPtr & modified_ } modified_query_info.query = queryNodeToSelectQuery(modified_query_info.query_tree); - LOG_DEBUG(&Poco::Logger::get("getModifiedQueryInfo"), "Modified query: {}", modified_query_info.query->formatForLogging()); } else { @@ -811,8 +798,6 @@ QueryPipelineBuilderPtr ReadFromMerge::createSources( modified_select.setFinal(); } - LOG_DEBUG(&Poco::Logger::get("createSources"), "real_column_names: {}", toString(real_column_names)); - bool allow_experimental_analyzer = modified_context->getSettingsRef().allow_experimental_analyzer; auto storage_stage = storage->getQueryProcessingStage(modified_context, @@ -1205,8 +1190,6 @@ void ReadFromMerge::convertingSourceStream( std::move(convert_actions_dag), ExpressionActionsSettings::fromContext(local_context, CompileExpressions::yes)); - LOG_DEBUG(&Poco::Logger::get("convertingSourceStream"), "The header: {}", builder.getHeader().dumpStructure()); - builder.addSimpleTransform([&](const Block & stream_header) { return std::make_shared(stream_header, actions); From 97a1ea01badaba10235ab0b01777f324b2f8365e Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Thu, 22 Jun 2023 15:10:53 +0000 Subject: [PATCH 011/884] Fix removeJoin --- src/Storages/StorageMerge.cpp | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 1a0376edbf5..fd7c0aae479 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -625,7 +626,8 @@ QueryTreeNodePtr removeJoin( QueryTreeNodePtr replacement_table_expression) { auto * query_node = query->as(); - auto modified_query = query_node->cloneAndReplace(query_node->getJoinTree(), replacement_table_expression); + auto join_tree = query_node->getJoinTree(); + auto modified_query = query_node->cloneAndReplace(join_tree, replacement_table_expression); query_node = modified_query->as(); 
@@ -636,20 +638,23 @@ QueryTreeNodePtr removeJoin( query_node->getHaving() = {}; query_node->getOrderBy().getNodes().clear(); - auto & projection = query_node->getProjection().getNodes(); - auto projection_columns = query_node->getProjectionColumns(); - for (size_t i = 0; i < projection.size();) + if (join_tree->as() == nullptr && join_tree->as() == nullptr) { - if (hasUnknownColumn(projection[i], original_table_expression, replacement_table_expression)) + auto & projection = query_node->getProjection().getNodes(); + auto projection_columns = query_node->getProjectionColumns(); + for (size_t i = 0; i < projection.size();) { - projection.erase(projection.begin() + i); - projection_columns.erase(projection_columns.begin() + i); - continue; + if (hasUnknownColumn(projection[i], original_table_expression, replacement_table_expression)) + { + projection.erase(projection.begin() + i); + projection_columns.erase(projection_columns.begin() + i); + continue; + } + ++i; } - ++i; - } - query_node->resolveProjectionColumns(std::move(projection_columns)); + query_node->resolveProjectionColumns(std::move(projection_columns)); + } return modified_query; } From 83022b77714a204ef4025d0b5081fbc127f2a586 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sat, 2 Sep 2023 21:56:36 +0200 Subject: [PATCH 012/884] Added support for parameterized view with analyzer by analyzing the select part with default values --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 47 +++++++++++++++++++++ src/Analyzer/TableFunctionNode.cpp | 7 +++ src/Analyzer/TableFunctionNode.h | 3 ++ src/Interpreters/InterpreterCreateQuery.cpp | 38 +++++++++++++++-- 4 files changed, 92 insertions(+), 3 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 348189854e8..c82d3079118 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -77,6 +77,12 @@ #include #include #include +#include +#include +#include +#include +#include +#include namespace ProfileEvents { @@ -6210,8 +6216,49 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, QueryExpressionsAliasVisitor & expressions_visitor, bool nested_table_function) { + + String database_name = scope.context->getCurrentDatabase(); + String table_name = table_function_node->getOriginalAST()->as()->name; + + if (table_function_node->getOriginalAST()->as()->is_compound_name) + { + std::vector parts; + splitInto<'.'>(parts, table_function_node->getOriginalAST()->as()->name); + + if (parts.size() == 2) + { + database_name = parts[0]; + table_name = parts[1]; + } + } + auto & table_function_node_typed = table_function_node->as(); + StoragePtr table = DatabaseCatalog::instance().tryGetTable({database_name, table_name}, scope.context->getQueryContext()); + if (table) + { + if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) + { + auto query = table->getInMemoryMetadataPtr()->getSelectQuery().inner_query->clone(); + NameToNameMap parameterized_view_values = analyzeFunctionParamValues(table_function_node->getOriginalAST()); + StorageView::replaceQueryParametersIfParametrizedView(query, parameterized_view_values); + + ASTCreateQuery create; + create.select = query->as(); + auto sample_block = InterpreterSelectWithUnionQuery::getSampleBlock(query, scope.context); + auto res = std::make_shared(StorageID(database_name, table_name), + create, + ColumnsDescription(sample_block.getNamesAndTypesList()), + /* comment */ "", + /* is_parameterized_view 
*/ true); + res->startup(); + table_function_node->getOriginalAST()->as()->prefer_subquery_to_function_formatting = true; + table_function_node_typed.resolve(std::move(res), scope.context); + return; + } + } + + if (!nested_table_function) expressions_visitor.visit(table_function_node_typed.getArgumentsNode()); diff --git a/src/Analyzer/TableFunctionNode.cpp b/src/Analyzer/TableFunctionNode.cpp index e5158a06373..f4ffe7f4ee5 100644 --- a/src/Analyzer/TableFunctionNode.cpp +++ b/src/Analyzer/TableFunctionNode.cpp @@ -36,6 +36,13 @@ void TableFunctionNode::resolve(TableFunctionPtr table_function_value, StoragePt unresolved_arguments_indexes = std::move(unresolved_arguments_indexes_); } +void TableFunctionNode::resolve(StoragePtr storage_value, ContextPtr context) +{ + storage = std::move(storage_value); + storage_id = storage->getStorageID(); + storage_snapshot = storage->getStorageSnapshot(storage->getInMemoryMetadataPtr(), context); +} + const StorageID & TableFunctionNode::getStorageID() const { if (!storage) diff --git a/src/Analyzer/TableFunctionNode.h b/src/Analyzer/TableFunctionNode.h index 69237ac8416..ed1a26c4dd4 100644 --- a/src/Analyzer/TableFunctionNode.h +++ b/src/Analyzer/TableFunctionNode.h @@ -100,6 +100,9 @@ public: /// Resolve table function with table function, storage and context void resolve(TableFunctionPtr table_function_value, StoragePtr storage_value, ContextPtr context, std::vector unresolved_arguments_indexes_); + /// Resolve table function as parameterized view with storage and context + void resolve(StoragePtr storage_value, ContextPtr context); + /// Get storage id, throws exception if function node is not resolved const StorageID & getStorageID() const; diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 92d74f4f18a..58b6722aae9 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -76,7 +76,8 @@ #include #include - +#include +#include namespace DB { @@ -745,12 +746,43 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti } else if (create.select) { - Block as_select_sample; if (getContext()->getSettingsRef().allow_experimental_analyzer) { - as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(create.select->clone(), getContext()); + if (create.isParameterizedView()) + { + auto select = create.select->clone(); + + ///Get all query parameters + const auto parameters = analyzeReceiveQueryParamsWithType(select); + NameToNameMap parameter_values; + + for (const auto & parameter : parameters) + { + const auto data_type = DataTypeFactory::instance().get(parameter.second); + /// Todo improve getting default values & include more datatypes + if (data_type->isValueRepresentedByNumber() || parameter.second == "String") + parameter_values[parameter.first] = "1"; + else if (parameter.second.starts_with("Array") || parameter.second.starts_with("Map")) + parameter_values[parameter.first] = "[]"; + else + parameter_values[parameter.first] = " "; + LOG_INFO(&Poco::Logger::get("InterpreterCreateQuery"), "parameter = {} = {} ", parameter.first, parameter_values[parameter.first]); + + } + + /// Replace with default parameters + ReplaceQueryParameterVisitor visitor(parameter_values); + visitor.visit(select); + + as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(select, getContext()); + } + else + { + as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(create.select->clone(), getContext()); + } + } 
else { From 2dfda84da0e16c594df7df4eb2b05ee1baba1193 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sat, 2 Sep 2023 21:57:57 +0200 Subject: [PATCH 013/884] Removed parameterized view tests from analyzer_tech_debt.txt --- tests/analyzer_tech_debt.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index 15d46403da9..5521234495f 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -88,7 +88,6 @@ 02402_merge_engine_with_view 02404_memory_bound_merging 02426_orc_bug -02428_parameterized_view 02458_use_structure_from_insertion_table 02479_race_condition_between_insert_and_droppin_mv 02493_inconsistent_hex_and_binary_number @@ -123,7 +122,6 @@ 02581_share_big_sets_between_mutation_tasks_long 02581_share_big_sets_between_multiple_mutations_tasks_long 00992_system_parts_race_condition_zookeeper_long -02818_parameterized_view_with_cte_multiple_usage 02790_optimize_skip_unused_shards_join 01940_custom_tld_sharding_key 02815_range_dict_no_direct_join From 59195e1199d5c8ed31f4243b58f3186771219295 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 4 Sep 2023 19:03:23 +0200 Subject: [PATCH 014/884] Removed log for each parameter --- src/Interpreters/InterpreterCreateQuery.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 58b6722aae9..66c219dcd56 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -768,8 +768,6 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti parameter_values[parameter.first] = "[]"; else parameter_values[parameter.first] = " "; - LOG_INFO(&Poco::Logger::get("InterpreterCreateQuery"), "parameter = {} = {} ", parameter.first, parameter_values[parameter.first]); - } /// Replace with default parameters From 961bf074daf0c901a3e9d14b6caa4ba6cb37cc7c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 20 Nov 2023 10:56:10 +0100 Subject: [PATCH 015/884] Initial draft version of adding backup support to AzureBlobStorage --- src/Backups/BackupFactory.cpp | 2 + src/Backups/BackupIO_AzureBlobStorage.cpp | 336 ++++++++++++++++++ src/Backups/BackupIO_AzureBlobStorage.h | 69 ++++ src/Backups/BackupImpl.cpp | 8 +- .../registerBackupEngineAzureBlobStorage.cpp | 134 +++++++ src/CMakeLists.txt | 3 + src/Common/ProfileEvents.cpp | 4 + .../copyAzureBlobStorageFile.cpp | 324 +++++++++++++++++ .../copyAzureBlobStorageFile.h | 58 +++ src/Storages/StorageAzureBlob.cpp | 11 + src/Storages/StorageAzureBlob.h | 1 + .../__init__.py | 1 + .../configs/config.xml | 11 + .../configs/disable_profilers.xml | 13 + .../configs/users.xml | 8 + .../test.py | 151 ++++++++ 16 files changed, 1132 insertions(+), 2 deletions(-) create mode 100644 src/Backups/BackupIO_AzureBlobStorage.cpp create mode 100644 src/Backups/BackupIO_AzureBlobStorage.h create mode 100644 src/Backups/registerBackupEngineAzureBlobStorage.cpp create mode 100644 src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp create mode 100644 src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/__init__.py create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml 
create mode 100644 tests/integration/test_backup_restore_azure_blob_storage/test.py diff --git a/src/Backups/BackupFactory.cpp b/src/Backups/BackupFactory.cpp index 898ac7bc490..31e87a21fc2 100644 --- a/src/Backups/BackupFactory.cpp +++ b/src/Backups/BackupFactory.cpp @@ -33,11 +33,13 @@ void BackupFactory::registerBackupEngine(const String & engine_name, const Creat void registerBackupEnginesFileAndDisk(BackupFactory &); void registerBackupEngineS3(BackupFactory &); +void registerBackupEngineAzureBlobStorage(BackupFactory &); void registerBackupEngines(BackupFactory & factory) { registerBackupEnginesFileAndDisk(factory); registerBackupEngineS3(factory); + registerBackupEngineAzureBlobStorage(factory); } BackupFactory::BackupFactory() diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp new file mode 100644 index 00000000000..d41d23e3c36 --- /dev/null +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -0,0 +1,336 @@ +#include + +#if USE_AZURE_BLOB_STORAGE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + + +namespace fs = std::filesystem; + +namespace DB +{ +namespace ErrorCodes +{ + extern const int AZURE_BLOB_STORAGE_ERROR; + extern const int LOGICAL_ERROR; +} + +//using AzureClientPtr = std::shared_ptr; + +BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( + StorageAzureBlob::Configuration configuration_, + const ReadSettings & read_settings_, + const WriteSettings & write_settings_, + const ContextPtr & context_) + : BackupReaderDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupReaderAzureBlobStorage")) + , data_source_description{DataSourceType::AzureBlobStorage, "AzureBlobStorage", false, false} + , configuration(configuration_) +{ + client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); + auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); + object_storage = std::make_unique("BackupReaderAzureBlobStorage", + std::make_unique(*client.get()), + std::move(settings_as_unique_ptr)); +} + +BackupReaderAzureBlobStorage::~BackupReaderAzureBlobStorage() = default; + +bool BackupReaderAzureBlobStorage::fileExists(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + return object_storage->exists(StoredObject(key)); +} + +UInt64 BackupReaderAzureBlobStorage::getFileSize(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + ObjectMetadata object_metadata = object_storage->getObjectMetadata(key); + return object_metadata.size_bytes; +} + +std::unique_ptr BackupReaderAzureBlobStorage::readFile(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + return std::make_unique( + client, key, read_settings, settings->max_single_read_retries, + settings->max_single_download_retries); +} + +void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) +{ + LOG_INFO(&Poco::Logger::get("BackupReaderAzureBlobStorage"), "Enter copyFileToDisk"); + + /// Use the native copy as a 
more optimal way to copy a file from AzureBlobStorage to AzureBlobStorage if it's possible. + /// We don't check for `has_throttling` here because the native copy almost doesn't use network. + auto destination_data_source_description = destination_disk->getDataSourceDescription(); + if (destination_data_source_description.sameKind(data_source_description) + && (destination_data_source_description.is_encrypted == encrypted_in_backup)) + { + LOG_TRACE(log, "Copying {} from AzureBlobStorage to disk {}", path_in_backup, destination_disk->getName()); + auto write_blob_function = [&](const Strings & blob_path, WriteMode mode, const std::optional & object_attributes) -> size_t + { + /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files. + if (blob_path.size() != 2 || mode != WriteMode::Rewrite) + throw Exception(ErrorCodes::LOGICAL_ERROR, + "Blob writing function called with unexpected blob_path.size={} or mode={}", + blob_path.size(), mode); + + std::shared_ptr dest_client; + if (configuration.container == blob_path[1]) + { + dest_client = client; + } + else + { + StorageAzureBlob::Configuration dest_configuration = configuration; + dest_configuration.container = blob_path[1]; + dest_configuration.blob_path = blob_path[0]; + dest_client = StorageAzureBlob::createClient(dest_configuration, /* is_read_only */ false); + } + + + copyAzureBlobStorageFile( + client, + dest_client, + configuration.container, + fs::path(configuration.blob_path) / path_in_backup, + 0, + file_size, + /* dest_bucket= */ blob_path[1], + /* dest_key= */ blob_path[0], + settings, + read_settings, + object_attributes, + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupReaderAzureBlobStorage"), + /* for_disk_azure_blob_storage= */ true); + + return file_size; + }; + + destination_disk->writeFileUsingBlobWritingFunction(destination_path, write_mode, write_blob_function); + return; /// copied! + } + + /// Fallback to copy through buffers. + BackupReaderDefault::copyFileToDisk(path_in_backup, file_size, encrypted_in_backup, destination_disk, destination_path, write_mode); +} + + +BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( + StorageAzureBlob::Configuration configuration_, + const ReadSettings & read_settings_, + const WriteSettings & write_settings_, + const ContextPtr & context_) + : BackupWriterDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupWriterAzureBlobStorage")) + , data_source_description{DataSourceType::AzureBlobStorage, "AzureBlobStorage", false, false} + , configuration(configuration_) +{ + client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); + auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); + object_storage = std::make_unique("BackupWriterAzureBlobStorage", + std::make_unique(*client.get()), + std::move(settings_as_unique_ptr)); +} + +void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) +{ + /// Use the native copy as a more optimal way to copy a file from AzureBlobStorage to AzureBlobStorage if it's possible. 
+ auto source_data_source_description = src_disk->getDataSourceDescription(); + if (source_data_source_description.sameKind(data_source_description) && (source_data_source_description.is_encrypted == copy_encrypted)) + { + /// getBlobPath() can return more than 3 elements if the file is stored as multiple objects in AzureBlobStorage bucket. + /// In this case we can't use the native copy. + if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) + { + + std::shared_ptr src_client; + if (configuration.container == blob_path[1]) + { + src_client = client; + } + else + { + StorageAzureBlob::Configuration src_configuration = configuration; + src_configuration.container = blob_path[1]; + src_configuration.blob_path = blob_path[0]; + src_client = StorageAzureBlob::createClient(src_configuration, /* is_read_only */ false); + } + + LOG_TRACE(log, "Copying file {} from disk {} to AzureBlobStorag", src_path, src_disk->getName()); + copyAzureBlobStorageFile( + src_client, + client, + /* src_bucket */ blob_path[1], + /* src_key= */ blob_path[0], + start_pos, + length, + configuration.container, + fs::path(configuration.blob_path) / path_in_backup, + settings, + read_settings, + {}, + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterAzureBlobStorage")); + return; /// copied! + } + } + + /// Fallback to copy through buffers. + BackupWriterDefault::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); +} + +void BackupWriterAzureBlobStorage::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) +{ + copyDataToAzureBlobStorageFile(create_read_buffer, start_pos, length, client, configuration.container, path_in_backup, settings, {}, + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterAzureBlobStorage")); +} + +BackupWriterAzureBlobStorage::~BackupWriterAzureBlobStorage() = default; + +bool BackupWriterAzureBlobStorage::fileExists(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + LOG_INFO(&Poco::Logger::get("BackupWriterAzureBlobStorage"), "Result fileExists {} ", object_storage->exists(StoredObject(key))); + + return object_storage->exists(StoredObject(key)); +} + +UInt64 BackupWriterAzureBlobStorage::getFileSize(const String & file_name) +{ + LOG_INFO(&Poco::Logger::get("BackupWriterAzureBlobStorage"), "Enter getFileSize"); + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + RelativePathsWithMetadata children; + object_storage->listObjects(key,children,/*max_keys*/0); + if (children.empty()) + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Object {} must exist"); + return children[0].metadata.size_bytes; +} + +std::unique_ptr BackupWriterAzureBlobStorage::readFile(const String & file_name, size_t /*expected_file_size*/) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + + return std::make_unique( + client, key, read_settings, settings->max_single_read_retries, + settings->max_single_download_retries); +} + +std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + return 
std::make_unique( + client, + key, + settings->max_single_part_upload_size, + DBMS_DEFAULT_BUFFER_SIZE, + write_settings); +} + +void BackupWriterAzureBlobStorage::removeFile(const String & file_name) +{ + String key; + if (startsWith(file_name, ".")) + { + key= configuration.blob_path + file_name; + } + else + { + key = file_name; + } + StoredObject object(key); + object_storage->removeObjectIfExists(object); +} + +void BackupWriterAzureBlobStorage::removeFiles(const Strings & keys) +{ + StoredObjects objects; + for (const auto & key : keys) + objects.emplace_back(key); + + object_storage->removeObjectsIfExist(objects); + +} + +void BackupWriterAzureBlobStorage::removeFilesBatch(const Strings & keys) +{ + StoredObjects objects; + for (const auto & key : keys) + objects.emplace_back(key); + + object_storage->removeObjectsIfExist(objects); +} + +} + +#endif diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h new file mode 100644 index 00000000000..6ef66fc432d --- /dev/null +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -0,0 +1,69 @@ +#pragma once + +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE +#include +#include +#include +#include + + +namespace DB +{ + +// using AzureClientPtr = std::shared_ptr; + +/// Represents a backup stored to Azure + class BackupReaderAzureBlobStorage : public BackupReaderDefault + { + public: + BackupReaderAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + ~BackupReaderAzureBlobStorage() override; + + bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; + std::unique_ptr readFile(const String & file_name) override; + + void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; + + private: + const DataSourceDescription data_source_description; + std::shared_ptr client; + StorageAzureBlob::Configuration configuration; + std::unique_ptr object_storage; + std::shared_ptr settings; + }; + + + class BackupWriterAzureBlobStorage : public BackupWriterDefault + { + public: + BackupWriterAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + ~BackupWriterAzureBlobStorage() override; + + bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; + std::unique_ptr writeFile(const String & file_name) override; + + void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; + void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + + void removeFile(const String & file_name) override; + void removeFiles(const Strings & file_names) override; + + private: + std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; + void removeFilesBatch(const Strings & file_names); + const DataSourceDescription data_source_description; + std::shared_ptr client; + StorageAzureBlob::Configuration configuration; + std::unique_ptr object_storage; + std::shared_ptr settings; + }; + +} + +#endif diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp 
index bb97335d8fb..9363ca5e7a7 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -492,6 +492,7 @@ void BackupImpl::checkBackupDoesntExist() const else file_name_to_check_existence = ".backup"; + LOG_INFO(&Poco::Logger::get("BackupImpl"), "checkBackupDoesntExist 1"); if (writer->fileExists(file_name_to_check_existence)) throw Exception(ErrorCodes::BACKUP_ALREADY_EXISTS, "Backup {} already exists", backup_name_for_logging); @@ -499,6 +500,7 @@ void BackupImpl::checkBackupDoesntExist() const if (!is_internal_backup) { assert(!lock_file_name.empty()); + LOG_INFO(&Poco::Logger::get("BackupImpl"), "checkBackupDoesntExist 2"); if (writer->fileExists(lock_file_name)) throw Exception(ErrorCodes::BACKUP_ALREADY_EXISTS, "Backup {} is being written already", backup_name_for_logging); } @@ -522,6 +524,8 @@ bool BackupImpl::checkLockFile(bool throw_if_failed) const if (throw_if_failed) { + LOG_INFO(&Poco::Logger::get("BackupImpl"), "checkLockFile"); + if (!writer->fileExists(lock_file_name)) { throw Exception( @@ -886,12 +890,12 @@ void BackupImpl::writeFile(const BackupFileInfo & info, BackupEntryPtr entry) } else if (src_disk && from_immutable_file) { - LOG_TRACE(log, "Writing backup for file {} from {} (disk {}): data file #{}", info.data_file_name, src_file_desc, src_disk->getName(), info.data_file_index); + LOG_INFO(log, "Writing backup for file {} from {} (disk {}): data file #{}", info.data_file_name, src_file_desc, src_disk->getName(), info.data_file_index); writer->copyFileFromDisk(info.data_file_name, src_disk, src_file_path, info.encrypted_by_disk, info.base_size, info.size - info.base_size); } else { - LOG_TRACE(log, "Writing backup for file {} from {}: data file #{}", info.data_file_name, src_file_desc, info.data_file_index); + LOG_INFO(log, "Writing backup for file {} from {}: data file #{}", info.data_file_name, src_file_desc, info.data_file_index); auto create_read_buffer = [entry, read_settings = writer->getReadSettings()] { return entry->getReadBuffer(read_settings); }; writer->copyDataToFile(info.data_file_name, create_read_buffer, info.base_size, info.size - info.base_size); } diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp new file mode 100644 index 00000000000..6f7b5f38c28 --- /dev/null +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -0,0 +1,134 @@ +#include "config.h" + +#include +#include + +#if USE_AZURE_BLOB_STORAGE +#include +#include +#include +#include +#include +#include +#include +#endif + + +namespace DB +{ +namespace fs = std::filesystem; + +namespace ErrorCodes +{ + extern const int BAD_ARGUMENTS; + extern const int SUPPORT_IS_DISABLED; +} + +#if USE_AZURE_BLOB_STORAGE +namespace +{ + String removeFileNameFromURL(String & url) + { + Poco::URI url2{url}; + String path = url2.getPath(); + size_t slash_pos = path.find_last_of('/'); + String file_name = path.substr(slash_pos + 1); + path.resize(slash_pos + 1); + url2.setPath(path); + url = url2.toString(); + return file_name; + } +} +#endif + + +void registerBackupEngineAzureBlobStorage(BackupFactory & factory) +{ + auto creator_fn = []([[maybe_unused]] const BackupFactory::CreateParams & params) -> std::unique_ptr + { +#if USE_AZURE_BLOB_STORAGE + const String & id_arg = params.backup_info.id_arg; + const auto & args = params.backup_info.args; + + LOG_INFO(&Poco::Logger::get("registerBackupEngineAzureBlobStorage"), "Begin id_arg={} args.size={}", id_arg, args.size()); + + StorageAzureBlob::Configuration 
configuration; + + if (args.size() == 4) + { + configuration.connection_url = args[0].safeGet(); + configuration.is_connection_string = true; + + configuration.container = args[1].safeGet(); + configuration.blob_path = args[2].safeGet(); + configuration.format = args[3].safeGet(); + + LOG_TRACE(&Poco::Logger::get("registerBackupEngineAzureBlobStorage"), "configuration.connection_url = {}" + "configuration.container = {}" + "configuration.blob_path = {}" + "configuration.format = {}", + configuration.connection_url, configuration.container, configuration.blob_path, configuration.format); + } + + + BackupImpl::ArchiveParams archive_params; + if (hasRegisteredArchiveFileExtension(configuration.blob_path)) + { + if (params.is_internal_backup) + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "Using archives with backups on clusters is disabled"); + + archive_params.archive_name = removeFileNameFromURL(configuration.blob_path); + archive_params.compression_method = params.compression_method; + archive_params.compression_level = params.compression_level; + archive_params.password = params.password; + } + else + { + if (!params.password.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Password is not applicable, backup cannot be encrypted"); + } + + + if (params.open_mode == IBackup::OpenMode::READ) + { + auto reader = std::make_shared(configuration, + params.read_settings, + params.write_settings, + params.context); + + return std::make_unique( + params.backup_info, + archive_params, + params.base_backup_info, + reader, + params.context, + /*params.use_same_s3_credentials_for_base_backup*/ false); + } + else + { + auto writer = std::make_shared(configuration, + params.read_settings, + params.write_settings, + params.context); + + return std::make_unique( + params.backup_info, + archive_params, + params.base_backup_info, + writer, + params.context, + params.is_internal_backup, + params.backup_coordination, + params.backup_uuid, + params.deduplicate_files, + /*params.use_same_s3_credentials_for_base_backup*/ false); + } +#else + throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "AzureBlobStorage support is disabled"); +#endif + }; + + factory.registerBackupEngine("AzureBlobStorage", creator_fn); +} + +} diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0257b7d329b..984594a6541 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -87,6 +87,7 @@ add_headers_and_sources(clickhouse_common_io IO) add_headers_and_sources(clickhouse_common_io IO/Archives) add_headers_and_sources(clickhouse_common_io IO/Resource) add_headers_and_sources(clickhouse_common_io IO/S3) +add_headers_and_sources(clickhouse_common_io IO/AzureBlobStorage) list (REMOVE_ITEM clickhouse_common_io_sources Common/malloc.cpp Common/new_delete.cpp) @@ -139,6 +140,7 @@ endif() if (TARGET ch_contrib::azure_sdk) add_headers_and_sources(dbms Disks/ObjectStorages/AzureBlobStorage) + add_headers_and_sources(dbms IO/AzureBlobStorage) endif() if (TARGET ch_contrib::hdfs) @@ -485,6 +487,7 @@ if (TARGET ch_contrib::aws_s3) endif() if (TARGET ch_contrib::azure_sdk) + target_link_libraries (clickhouse_common_io PUBLIC ch_contrib::azure_sdk) dbms_target_link_libraries (PRIVATE ch_contrib::azure_sdk) endif() diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index 58e860ebcaf..1655d19986a 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -361,6 +361,10 @@ The server successfully detected this situation and will download merged part fr M(S3PutObject, "Number of S3 API PutObject 
calls.") \ M(S3GetObject, "Number of S3 API GetObject calls.") \ \ + M(AzureUploadPart, "Number of Azure blob storage API UploadPart calls") \ + M(DiskAzureUploadPart, "Number of Disk Azure blob storage API UploadPart calls") \ + M(AzureCopyObject, "Number of Azure blob storage API CopyObject calls") \ + M(DiskAzureCopyObject, "Number of Disk Azure blob storage API CopyObject calls") \ M(AzureDeleteObjects, "Number of Azure blob storage API DeleteObject(s) calls.") \ M(AzureListObjects, "Number of Azure blob storage API ListObjects calls.") \ \ diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp new file mode 100644 index 00000000000..bf0bcac664b --- /dev/null +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -0,0 +1,324 @@ +#include + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace ProfileEvents +{ + extern const Event AzureCopyObject; + extern const Event AzureUploadPart; + + extern const Event DiskAzureCopyObject; + extern const Event DiskAzureUploadPart; +} + + +namespace DB +{ + +size_t max_single_operation_copy_size = 256 * 1024 * 1024; + + +namespace +{ + class UploadHelper + { + public: + UploadHelper( + const CreateReadBuffer & create_read_buffer_, + std::shared_ptr client_, + size_t offset_, + size_t total_size_, + const String & dest_bucket_, + const String & dest_key_, + std::shared_ptr settings_, + const std::optional> & object_metadata_, + ThreadPoolCallbackRunner schedule_, + bool for_disk_azure_blob_storage_) + : create_read_buffer(create_read_buffer_) + , client(client_) + , offset (offset_) + , total_size (total_size_) + , dest_bucket(dest_bucket_) + , dest_key(dest_key_) + , settings(settings_) + , object_metadata(object_metadata_) + , schedule(schedule_) + , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) + , log(&Poco::Logger::get("azureBlobStorageUploadHelper")) + , max_single_part_upload_size(settings_.get()->max_single_part_upload_size) + { + } + + ~UploadHelper() {} + + protected: + std::function()> create_read_buffer; + std::shared_ptr client; + size_t offset; + size_t total_size; + const String & dest_bucket; + const String & dest_key; + std::shared_ptr settings; + const std::optional> & object_metadata; + ThreadPoolCallbackRunner schedule; + bool for_disk_azure_blob_storage; + const Poco::Logger * log; + size_t max_single_part_upload_size; + + struct UploadPartTask + { + char *data = nullptr; + size_t size = 0; + std::string block_id; + bool is_finished = false; + std::exception_ptr exception; + + ~UploadPartTask() + { + if (data != nullptr) + free(data); + } + }; + + size_t normal_part_size; + std::vector block_ids; + + std::list TSA_GUARDED_BY(bg_tasks_mutex) bg_tasks; + int num_added_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + int num_finished_bg_tasks TSA_GUARDED_BY(bg_tasks_mutex) = 0; + std::mutex bg_tasks_mutex; + std::condition_variable bg_tasks_condvar; + + public: + void performCopy() + { + performMultipartUpload(); + } + + void completeMultipartUpload() + { + auto block_blob_client = client->GetBlockBlobClient(dest_key); + block_blob_client.CommitBlockList(block_ids); + } + + void performMultipartUpload() + { + normal_part_size = 1024; + + size_t position = offset; + size_t end_position = offset + total_size; + + try + { + while (position < end_position) + { + size_t next_position = std::min(position + normal_part_size, end_position); + size_t part_size = 
next_position - position; /// `part_size` is either `normal_part_size` or smaller if it's the final part. + + uploadPart(position, part_size); + + position = next_position; + } + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + waitForAllBackgroundTasks(); + throw; + } + + waitForAllBackgroundTasks(); + completeMultipartUpload(); + } + + + void uploadPart(size_t part_offset, size_t part_size) + { + LOG_TRACE(log, "Writing part. Bucket: {}, Key: {}, Size: {}", dest_bucket, dest_key, part_size); + + if (!part_size) + { + LOG_TRACE(log, "Skipping writing an empty part."); + return; + } + + if (schedule) + { + UploadPartTask * task = nullptr; + + { + std::lock_guard lock(bg_tasks_mutex); + task = &bg_tasks.emplace_back(); + ++num_added_bg_tasks; + } + + /// Notify waiting thread when task finished + auto task_finish_notify = [this, task]() + { + std::lock_guard lock(bg_tasks_mutex); + task->is_finished = true; + ++num_finished_bg_tasks; + + /// Notification under mutex is important here. + /// Otherwise, WriteBuffer could be destroyed in between + /// Releasing lock and condvar notification. + bg_tasks_condvar.notify_one(); + }; + + try + { + auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); + auto buffer = std::make_unique(std::move(read_buffer), part_size); + task->data = new char[part_size]; + task->size = part_size; + buffer->read(task->data,part_size); + task->block_id = getRandomASCIIString(64); + + schedule([this, task, task_finish_notify]() + { + try + { + processUploadTask(*task); + } + catch (...) + { + task->exception = std::current_exception(); + } + task_finish_notify(); + }, Priority{}); + } + catch (...) + { + task_finish_notify(); + throw; + } + } + else + { + UploadPartTask task; + auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); + auto buffer = std::make_unique(std::move(read_buffer), part_size); + task.data = new char[part_size]; + buffer->read(task.data,part_size); + task.size = part_size; + processUploadTask(task); + block_ids.emplace_back(task.block_id); + } + } + + void processUploadTask(UploadPartTask & task) + { + auto block_id = processUploadPartRequest(task); + + std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race + task.block_id = block_id; + LOG_TRACE(log, "Writing part finished. 
Bucket: {}, Key: {}, block_id: {}, Parts: {}", dest_bucket, dest_key, block_id, bg_tasks.size()); + } + + String processUploadPartRequest(UploadPartTask & task) + { + ProfileEvents::increment(ProfileEvents::AzureUploadPart); + if (for_disk_azure_blob_storage) + ProfileEvents::increment(ProfileEvents::DiskAzureUploadPart); + + auto block_blob_client = client->GetBlockBlobClient(dest_key); + task.block_id = getRandomASCIIString(64); + Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(task.data), task.size); + block_blob_client.StageBlock(task.block_id, memory); + + return task.block_id; + } + + + void waitForAllBackgroundTasks() + { + if (!schedule) + return; + + std::unique_lock lock(bg_tasks_mutex); + /// Suppress warnings because bg_tasks_mutex is actually hold, but tsa annotations do not understand std::unique_lock + bg_tasks_condvar.wait(lock, [this]() {return TSA_SUPPRESS_WARNING_FOR_READ(num_added_bg_tasks) == TSA_SUPPRESS_WARNING_FOR_READ(num_finished_bg_tasks); }); + + auto & tasks = TSA_SUPPRESS_WARNING_FOR_WRITE(bg_tasks); + for (auto & task : tasks) + { + if (task.exception) + std::rethrow_exception(task.exception); + block_ids.emplace_back(task.block_id); + } + } + }; +} + + +void copyDataToAzureBlobStorageFile( + const std::function()> & create_read_buffer, + size_t offset, + size_t size, + std::shared_ptr & dest_client, + const String & dest_bucket, + const String & dest_key, + std::shared_ptr settings, + const std::optional> & object_metadata, + ThreadPoolCallbackRunner schedule, + bool for_disk_azure_blob_storage) +{ + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage}; + helper.performCopy(); +} + + +void copyAzureBlobStorageFile( + std::shared_ptr src_client, + std::shared_ptr dest_client, + const String & src_bucket, + const String & src_key, + size_t offset, + size_t size, + const String & dest_bucket, + const String & dest_key, + std::shared_ptr settings, + const ReadSettings & read_settings, + const std::optional> & object_metadata, + ThreadPoolCallbackRunner schedule, + bool for_disk_azure_blob_storage) +{ + + if (size < max_single_operation_copy_size) + { + ProfileEvents::increment(ProfileEvents::AzureCopyObject); + if (for_disk_azure_blob_storage) + ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); + auto block_blob_client_src = src_client->GetBlockBlobClient(src_key); + auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_key); + auto uri = block_blob_client_src.GetUrl(); + block_blob_client_dest.CopyFromUri(uri); + } + else + { + LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Bucket: {}, Key: {}", src_bucket, src_key); + auto create_read_buffer = [&] + { + return std::make_unique(src_client, src_key, read_settings, settings->max_single_read_retries, + settings->max_single_download_retries); + }; + + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage}; + helper.performCopy(); + } +} + +} + +#endif diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h new file mode 100644 index 00000000000..31228fbcb23 --- /dev/null +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -0,0 +1,58 @@ +#pragma once + +#include "config.h" + +#if USE_AZURE_BLOB_STORAGE + +#include +#include +#include +#include +#include +#include + + +namespace DB +{ 
+class SeekableReadBuffer; + +using CreateReadBuffer = std::function()>; + +/// Copies a file from AzureBlobStorage to AzureBlobStorage. +/// The parameters `src_offset` and `src_size` specify a part in the source to copy. +void copyAzureBlobStorageFile( + std::shared_ptr src_client, + std::shared_ptr dest_client, + const String & src_bucket, + const String & src_key, + size_t src_offset, + size_t src_size, + const String & dest_bucket, + const String & dest_key, + std::shared_ptr settings, + const ReadSettings & read_settings, + const std::optional> & object_metadata = std::nullopt, + ThreadPoolCallbackRunner schedule_ = {}, + bool for_disk_azure_blob_storage = false); + + +/// Copies data from any seekable source to AzureBlobStorage. +/// The same functionality can be achieved by using the function copyData() and the class WriteBufferFromAzureBlobStorage, +/// however copyDataToAzureBlobStorageFile() is faster and uses less memory. +/// The callback `create_read_buffer` can be called from multiple threads in parallel, so it should be thread-safe. +/// The parameters `offset` and `size` specify a part in the source to copy. +void copyDataToAzureBlobStorageFile( + const std::function()> & create_read_buffer, + size_t offset, + size_t size, + std::shared_ptr & client, + const String & dest_bucket, + const String & dest_key, + std::shared_ptr settings, + const std::optional> & object_metadata = std::nullopt, + ThreadPoolCallbackRunner schedule_ = {}, + bool for_disk_azure_blob_storage = false); + +} + +#endif diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 2e0703a8df3..e36604cfb1a 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -258,6 +258,17 @@ AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(ContextPtr loca return settings_ptr; } +std::shared_ptr StorageAzureBlob::createSettingsAsSharedPtr(ContextPtr local_context) +{ + const auto & context_settings = local_context->getSettingsRef(); + auto settings_ptr = std::make_shared(); + settings_ptr->max_single_part_upload_size = context_settings.azure_max_single_part_upload_size; + settings_ptr->max_single_read_retries = context_settings.azure_max_single_read_retries; + settings_ptr->list_object_keys_size = static_cast(context_settings.azure_list_object_keys_size); + + return settings_ptr; +} + void registerStorageAzureBlob(StorageFactory & factory) { factory.registerStorage("AzureBlobStorage", [](const StorageFactory::Arguments & args) diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index b97dee0caed..570e4124d73 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -80,6 +80,7 @@ public: static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration, bool is_read_only); static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); + static std::shared_ptr createSettingsAsSharedPtr(ContextPtr local_context); static void processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection); diff --git a/tests/integration/test_backup_restore_azure_blob_storage/__init__.py b/tests/integration/test_backup_restore_azure_blob_storage/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml
b/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml new file mode 100644 index 00000000000..5725dce40cd --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml @@ -0,0 +1,11 @@ + + 1 + 0 + 0.0 + 0 + 1 + 1 + 0 + 16 + 16 + \ No newline at end of file diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml new file mode 100644 index 00000000000..b74bb1502ce --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml @@ -0,0 +1,13 @@ + + + + + 0 + 0 + 0 + 1000 + 1 + 1 + + + diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml new file mode 100644 index 00000000000..c12eb2f79f4 --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml @@ -0,0 +1,8 @@ + + + + + default + + + diff --git a/tests/integration/test_backup_restore_azure_blob_storage/test.py b/tests/integration/test_backup_restore_azure_blob_storage/test.py new file mode 100644 index 00000000000..2ecf08a4f40 --- /dev/null +++ b/tests/integration/test_backup_restore_azure_blob_storage/test.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python3 + +import gzip +import json +import logging +import os +import io +import random +import threading +import time + +from azure.storage.blob import BlobServiceClient +import helpers.client +import pytest +from helpers.cluster import ClickHouseCluster, ClickHouseInstance +from helpers.network import PartitionManager +from helpers.mock_servers import start_mock_servers +from helpers.test_tools import exec_query_with_retry + + + +@pytest.fixture(scope="module") +def cluster(): + try: + cluster = ClickHouseCluster(__file__) + cluster.add_instance( + "node", + main_configs=["configs/config.xml"], + user_configs=["configs/disable_profilers.xml", "configs/users.xml"], + with_azurite=True, + ) + cluster.start() + + yield cluster + finally: + cluster.shutdown() + + +def azure_query( + node, query, expect_error="false", try_num=10, settings={}, query_on_retry=None +): + for i in range(try_num): + try: + if expect_error == "true": + return node.query_and_get_error(query, settings=settings) + else: + return node.query(query, settings=settings) + except Exception as ex: + retriable_errors = [ + "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", + "DB::Exception: Azure::Core::Http::TransportException: Connection closed before getting full response or response is less than expected", + "DB::Exception: Azure::Core::Http::TransportException: Connection was closed by the server while trying to read a response", + "DB::Exception: Azure::Core::Http::TransportException: Error while polling for socket ready read", + "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", + "Azure::Core::Http::TransportException, e.what() = Connection closed before getting full response or response is less than expected", + "Azure::Core::Http::TransportException, e.what() = Connection was closed by the server while trying to read a response", + "Azure::Core::Http::TransportException, e.what() = Error while polling for socket ready read", + ] + retry = False + for error in retriable_errors: + if error in str(ex): + retry = 
True + print(f"Try num: {i}. Having retriable error: {ex}") + time.sleep(i) + break + if not retry or i == try_num - 1: + raise Exception(ex) + if query_on_retry is not None: + node.query(query_on_retry) + continue + + +def get_azure_file_content(filename, port): + container_name = "cont" + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string( + str(connection_string) + ) + container_client = blob_service_client.get_container_client(container_name) + blob_client = container_client.get_blob_client(filename) + download_stream = blob_client.download_blob() + return download_stream.readall().decode("utf-8") + + +def put_azure_file_content(filename, port, data): + container_name = "cont" + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + try: + container_client = blob_service_client.create_container(container_name) + except: + container_client = blob_service_client.get_container_client(container_name) + + blob_client = container_client.get_blob_client(filename) + buf = io.BytesIO(data) + blob_client.upload_blob(buf) + +@pytest.fixture(autouse=True, scope="function") +def delete_all_files(cluster): + port = cluster.env_variables["AZURITE_PORT"] + connection_string = ( + f"DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;" + f"AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;" + f"BlobEndpoint=http://127.0.0.1:{port}/devstoreaccount1;" + ) + blob_service_client = BlobServiceClient.from_connection_string(connection_string) + containers = blob_service_client.list_containers() + for container in containers: + container_client = blob_service_client.get_container_client(container) + blob_list = container_client.list_blobs() + for blob in blob_list: + print(blob) + blob_client = container_client.get_blob_client(blob) + blob_client.delete_blob() + + assert len(list(container_client.list_blobs())) == 0 + + yield + + +def test_create_table_connection_string(cluster): + node = cluster.instances["node"] + azure_query( + node, + f"CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_create_connection_string', 'CSV')", + ) + +def test_backup_restore(cluster): + node = cluster.instances["node"] + port = cluster.env_variables["AZURITE_PORT"] + azure_query( + node, + f"CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c.csv', 'CSV')", + ) + azure_query(node, f"INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") + print(get_azure_file_content("test_simple_write_c.csv", port)) + assert get_azure_file_content("test_simple_write_c.csv", port) == '1,"a"\n' + + backup_destination = f"AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c_backup.csv', 'CSV')" + azure_query(node,f"BACKUP TABLE 
test_simple_write_connection_string TO {backup_destination}") + print (get_azure_file_content("test_simple_write_c_backup.csv.backup", port)) + azure_query(node, f"RESTORE TABLE test_simple_write_connection_string AS test_simple_write_connection_string_restored FROM {backup_destination};") + assert(azure_query(node,f"SELECT * from test_simple_write_connection_string_restored") == "1\ta\n") \ No newline at end of file From 05b608cd76da8995086887f812e1ab3fceb99551 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Mon, 20 Nov 2023 10:12:45 +0000 Subject: [PATCH 016/884] Automatic style fix --- .../test.py | 24 ++++++++++++++----- 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_backup_restore_azure_blob_storage/test.py b/tests/integration/test_backup_restore_azure_blob_storage/test.py index 2ecf08a4f40..cda3cab07e4 100644 --- a/tests/integration/test_backup_restore_azure_blob_storage/test.py +++ b/tests/integration/test_backup_restore_azure_blob_storage/test.py @@ -18,7 +18,6 @@ from helpers.mock_servers import start_mock_servers from helpers.test_tools import exec_query_with_retry - @pytest.fixture(scope="module") def cluster(): try: @@ -103,6 +102,7 @@ def put_azure_file_content(filename, port, data): buf = io.BytesIO(data) blob_client.upload_blob(buf) + @pytest.fixture(autouse=True, scope="function") def delete_all_files(cluster): port = cluster.env_variables["AZURITE_PORT"] @@ -133,6 +133,7 @@ def test_create_table_connection_string(cluster): f"CREATE TABLE test_create_table_conn_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_create_connection_string', 'CSV')", ) + def test_backup_restore(cluster): node = cluster.instances["node"] port = cluster.env_variables["AZURITE_PORT"] @@ -140,12 +141,23 @@ def test_backup_restore(cluster): node, f"CREATE TABLE test_simple_write_connection_string (key UInt64, data String) Engine = AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c.csv', 'CSV')", ) - azure_query(node, f"INSERT INTO test_simple_write_connection_string VALUES (1, 'a')") + azure_query( + node, f"INSERT INTO test_simple_write_connection_string VALUES (1, 'a')" + ) print(get_azure_file_content("test_simple_write_c.csv", port)) assert get_azure_file_content("test_simple_write_c.csv", port) == '1,"a"\n' backup_destination = f"AzureBlobStorage('{cluster.env_variables['AZURITE_CONNECTION_STRING']}', 'cont', 'test_simple_write_c_backup.csv', 'CSV')" - azure_query(node,f"BACKUP TABLE test_simple_write_connection_string TO {backup_destination}") - print (get_azure_file_content("test_simple_write_c_backup.csv.backup", port)) - azure_query(node, f"RESTORE TABLE test_simple_write_connection_string AS test_simple_write_connection_string_restored FROM {backup_destination};") - assert(azure_query(node,f"SELECT * from test_simple_write_connection_string_restored") == "1\ta\n") \ No newline at end of file + azure_query( + node, + f"BACKUP TABLE test_simple_write_connection_string TO {backup_destination}", + ) + print(get_azure_file_content("test_simple_write_c_backup.csv.backup", port)) + azure_query( + node, + f"RESTORE TABLE test_simple_write_connection_string AS test_simple_write_connection_string_restored FROM {backup_destination};", + ) + assert ( + azure_query(node, f"SELECT * from test_simple_write_connection_string_restored") + == "1\ta\n" + ) From 6dfb1c25ec6a4a61a4fe329191c10263eb19ad07 Mon Sep 17 00:00:00 2001 From: Smita 
Kulkarni Date: Mon, 20 Nov 2023 11:37:06 +0100 Subject: [PATCH 017/884] Added docs --- docs/en/operations/backup.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index 6068b185ede..15d953249a0 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -451,3 +451,24 @@ To disallow concurrent backup/restore, you can use these settings respectively. The default value for both is true, so by default concurrent backup/restores are allowed. When these settings are false on a cluster, only 1 backup/restore is allowed to run on a cluster at a time. + +## Configuring BACKUP/RESTORE to use an AzureBlobStorage Endpoint + +To write backups to an AzureBlobStorage container, you need the following pieces of information: +- AzureBlobStorage endpoint connection string / url, +- Container, +- Path, +- Account name (if url is specified) +- Account Key (if url is specified) + +The destination for a backup is specified like this: +``` +AzureBlobStorage('<connection string>/<url>', '<container>', '<path>', '<account name>', '<account key>') +``` + +```sql +BACKUP TABLE data TO AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', + 'test_container', 'data_backup'); +RESTORE TABLE data AS data_restored FROM AzureBlobStorage('DefaultEndpointsProtocol=http;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=http://azurite1:10000/devstoreaccount1/;', + 'test_container', 'data_backup'); +``` From 96c4b6bc35ee818afd2d2963dec7afdb5583969c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 20 Nov 2023 14:41:14 +0100 Subject: [PATCH 018/884] Updated to not analyze create parameterized view for analyzer & old analyzer --- src/Interpreters/InterpreterCreateQuery.cpp | 48 +++++-------------- src/Storages/StorageView.cpp | 3 +- .../0_stateless/02428_parameterized_view.sh | 2 +- 3 files changed, 14 insertions(+), 39 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 747c0be009e..4ee666e2a9a 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -649,6 +649,7 @@ ColumnsDescription InterpreterCreateQuery::getColumnsDescription( if (!attach && !is_restore_from_backup && context_->getSettingsRef().flatten_nested) res.flattenNested(); + if (res.getAllPhysical().empty()) throw Exception(ErrorCodes::EMPTY_LIST_OF_COLUMNS_PASSED, "Cannot CREATE table without physical columns"); @@ -755,49 +756,22 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti { Block as_select_sample; - if (getContext()->getSettingsRef().allow_experimental_analyzer) + if (!create.isParameterizedView()) { - if (create.isParameterizedView()) - { - auto select = create.select->clone(); - - ///Get all query parameters - const auto parameters = analyzeReceiveQueryParamsWithType(select); - NameToNameMap parameter_values; - - for (const auto & parameter : parameters) - { - const auto data_type = DataTypeFactory::instance().get(parameter.second); - /// Todo improve getting default values & include more datatypes - if (data_type->isValueRepresentedByNumber() || parameter.second == "String") - parameter_values[parameter.first] = "1"; - else if (parameter.second.starts_with("Array") ||
parameter.second.starts_with("Map")) - parameter_values[parameter.first] = "[]"; - else - parameter_values[parameter.first] = " "; - } - - /// Replace with default parameters - ReplaceQueryParameterVisitor visitor(parameter_values); - visitor.visit(select); - - as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(select, getContext()); - } - else + if (getContext()->getSettingsRef().allow_experimental_analyzer) { as_select_sample = InterpreterSelectQueryAnalyzer::getSampleBlock(create.select->clone(), getContext()); } + else + { + as_select_sample = InterpreterSelectWithUnionQuery::getSampleBlock(create.select->clone(), + getContext(), + false /* is_subquery */, + create.isParameterizedView()); + } + properties.columns = ColumnsDescription(as_select_sample.getNamesAndTypesList()); } - else - { - as_select_sample = InterpreterSelectWithUnionQuery::getSampleBlock(create.select->clone(), - getContext(), - false /* is_subquery */, - create.isParameterizedView()); - } - - properties.columns = ColumnsDescription(as_select_sample.getNamesAndTypesList()); } else if (create.as_table_function) { diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index f0f9b9540de..2f7267e3701 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -112,7 +112,8 @@ StorageView::StorageView( : IStorage(table_id_) { StorageInMemoryMetadata storage_metadata; - storage_metadata.setColumns(columns_); + if (is_parameterized_view_ && !query.isParameterizedView()) + storage_metadata.setColumns(columns_); storage_metadata.setComment(comment); if (!query.select) diff --git a/tests/queries/0_stateless/02428_parameterized_view.sh b/tests/queries/0_stateless/02428_parameterized_view.sh index ad9c672f4c5..499b8697ffc 100755 --- a/tests/queries/0_stateless/02428_parameterized_view.sh +++ b/tests/queries/0_stateless/02428_parameterized_view.sh @@ -37,7 +37,7 @@ $CLICKHOUSE_CLIENT -q "CREATE VIEW test_02428_pv1 AS SELECT * FROM test_02428_Ca $CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1(price=20)" $CLICKHOUSE_CLIENT -q "SELECT Price FROM \`test_02428_pv1\`(price=20)" -$CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1" 2>&1 | grep -Fq "UNKNOWN_QUERY_PARAMETER" && echo 'ERROR' || echo 'OK' +$CLICKHOUSE_CLIENT -q "SELECT Price FROM test_02428_pv1" 2>&1 | grep -q "UNKNOWN_QUERY_PARAMETER\|UNKNOWN_IDENTIFIER" && echo 'ERROR' || echo 'OK' $CLICKHOUSE_CLIENT --param_p 10 -q "SELECT Price FROM test_02428_pv1(price={p:UInt64})" $CLICKHOUSE_CLIENT --param_l 1 -q "SELECT Price FROM test_02428_pv1(price=50) LIMIT ({l:UInt64})" From d0827e3ea77ff432c4a6a66145827428bcd62b5e Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 1 Dec 2023 17:45:23 +0000 Subject: [PATCH 019/884] Add a test. 
--- .../0_stateless/02932_set_ttl_where.reference | 0 .../0_stateless/02932_set_ttl_where.sql | 22 +++++++++++++++++++ 2 files changed, 22 insertions(+) create mode 100644 tests/queries/0_stateless/02932_set_ttl_where.reference create mode 100644 tests/queries/0_stateless/02932_set_ttl_where.sql diff --git a/tests/queries/0_stateless/02932_set_ttl_where.reference b/tests/queries/0_stateless/02932_set_ttl_where.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02932_set_ttl_where.sql b/tests/queries/0_stateless/02932_set_ttl_where.sql new file mode 100644 index 00000000000..85fddf613e8 --- /dev/null +++ b/tests/queries/0_stateless/02932_set_ttl_where.sql @@ -0,0 +1,22 @@ +create or replace table temp ( + a UInt32 +) +engine = MergeTree +order by a; + +insert into temp select number from system.numbers limit 100_000; + +create or replace table t_temp ( + a UInt32, + timestamp DateTime +) +engine = MergeTree +order by a +TTL timestamp + INTERVAL 2 SECOND WHERE a in (select a from temp); + +select sleep(1); +insert into t_temp select rand(), now() from system.numbers limit 1_000_000; +select sleep(1); +insert into t_temp select rand(), now() from system.numbers limit 1_000_000; +select sleep(1); +optimize table t_temp final; From 508046e6922c0cb163ce5611f1e6ef6a22f8b7f1 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Fri, 1 Dec 2023 20:31:26 +0000 Subject: [PATCH 020/884] Attempt to support subqueries in TTL. --- src/Interpreters/PreparedSets.cpp | 8 ++- src/Interpreters/PreparedSets.h | 1 + src/Processors/TTL/ITTLAlgorithm.cpp | 5 +- src/Processors/TTL/ITTLAlgorithm.h | 9 ++- .../TTL/TTLAggregationAlgorithm.cpp | 11 ++-- src/Processors/TTL/TTLAggregationAlgorithm.h | 1 + src/Processors/TTL/TTLColumnAlgorithm.cpp | 5 +- src/Processors/TTL/TTLColumnAlgorithm.h | 1 + src/Processors/TTL/TTLDeleteAlgorithm.cpp | 10 +-- src/Processors/TTL/TTLDeleteAlgorithm.h | 2 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp | 5 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.h | 1 + src/Processors/Transforms/TTLTransform.cpp | 2 +- .../MergeTree/MergeTreeDataWriter.cpp | 19 +++--- src/Storages/StorageInMemoryMetadata.cpp | 21 +++---- src/Storages/TTLDescription.cpp | 62 ++++++++++++------- src/Storages/TTLDescription.h | 15 ++++- 17 files changed, 116 insertions(+), 62 deletions(-) diff --git a/src/Interpreters/PreparedSets.cpp b/src/Interpreters/PreparedSets.cpp index 955d8892284..ea8d9a62b8b 100644 --- a/src/Interpreters/PreparedSets.cpp +++ b/src/Interpreters/PreparedSets.cpp @@ -189,11 +189,17 @@ SetPtr FutureSetFromSubquery::buildOrderedSetInplace(const ContextPtr & context) } } + set_and_key->set->fillSetElements(); + + return buildSetInplace(context); +} + +SetPtr FutureSetFromSubquery::buildSetInplace(const ContextPtr & context) +{ auto plan = build(context); if (!plan) return nullptr; - set_and_key->set->fillSetElements(); auto builder = plan->buildQueryPipeline(QueryPlanOptimizationSettings::fromContext(context), BuildQueryPipelineSettings::fromContext(context)); auto pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); pipeline.complete(std::make_shared(Block())); diff --git a/src/Interpreters/PreparedSets.h b/src/Interpreters/PreparedSets.h index e237789c63c..3e751d309ba 100644 --- a/src/Interpreters/PreparedSets.h +++ b/src/Interpreters/PreparedSets.h @@ -107,6 +107,7 @@ public: SetPtr get() const override; DataTypes getTypes() const override; SetPtr buildOrderedSetInplace(const ContextPtr & context) override; + SetPtr 
buildSetInplace(const ContextPtr & context); std::unique_ptr build(const ContextPtr & context); diff --git a/src/Processors/TTL/ITTLAlgorithm.cpp b/src/Processors/TTL/ITTLAlgorithm.cpp index 79140137df8..af6c4e4ac35 100644 --- a/src/Processors/TTL/ITTLAlgorithm.cpp +++ b/src/Processors/TTL/ITTLAlgorithm.cpp @@ -11,8 +11,9 @@ namespace ErrorCodes } ITTLAlgorithm::ITTLAlgorithm( - const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) - : description(description_) + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + : ttl_expressions(ttl_expressions_) + , description(description_) , old_ttl_info(old_ttl_info_) , current_time(current_time_) , force(force_) diff --git a/src/Processors/TTL/ITTLAlgorithm.h b/src/Processors/TTL/ITTLAlgorithm.h index 49cd2c46d9d..6e73286b564 100644 --- a/src/Processors/TTL/ITTLAlgorithm.h +++ b/src/Processors/TTL/ITTLAlgorithm.h @@ -8,6 +8,12 @@ namespace DB { +struct TTlExpressions +{ + ExpressionActionsPtr expression; + ExpressionActionsPtr where_expression; +}; + /** * Represents the actions, which are required to do * with data, when TTL is expired: delete, aggregate, etc. @@ -18,7 +24,7 @@ public: using TTLInfo = IMergeTreeDataPart::TTLInfo; using MutableDataPartPtr = MergeTreeMutableDataPartPtr; - ITTLAlgorithm(const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + ITTLAlgorithm(const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); virtual ~ITTLAlgorithm() = default; virtual void execute(Block & block) = 0; @@ -39,6 +45,7 @@ protected: bool isTTLExpired(time_t ttl) const; UInt32 getTimestampByIndex(const IColumn * column, size_t index) const; + const TTlExpressions ttl_expressions; const TTLDescription description; const TTLInfo old_ttl_info; const time_t current_time; diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.cpp b/src/Processors/TTL/TTLAggregationAlgorithm.cpp index fa3436ec55d..ab2ba5f58fc 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.cpp +++ b/src/Processors/TTL/TTLAggregationAlgorithm.cpp @@ -5,13 +5,14 @@ namespace DB { TTLAggregationAlgorithm::TTLAggregationAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_, const Block & header_, const MergeTreeData & storage_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) , header(header_) { current_key_value.resize(description.group_by_keys.size()); @@ -73,8 +74,8 @@ void TTLAggregationAlgorithm::execute(Block & block) const auto & column_names = header.getNames(); MutableColumns aggregate_columns = header.cloneEmptyColumns(); - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); - auto where_column = executeExpressionAndGetColumn(description.where_expression, block, description.where_result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); + auto where_column = executeExpressionAndGetColumn(ttl_expressions.where_expression, block, description.where_result_column); size_t rows_aggregated = 0; size_t current_key_start = 0; @@ -145,8 +146,8 @@ void 
TTLAggregationAlgorithm::execute(Block & block) /// If some rows were aggregated we have to recalculate ttl info's if (some_rows_were_aggregated) { - auto ttl_column_after_aggregation = executeExpressionAndGetColumn(description.expression, block, description.result_column); - auto where_column_after_aggregation = executeExpressionAndGetColumn(description.where_expression, block, description.where_result_column); + auto ttl_column_after_aggregation = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); + auto where_column_after_aggregation = executeExpressionAndGetColumn(ttl_expressions.where_expression, block, description.where_result_column); for (size_t i = 0; i < block.rows(); ++i) { bool where_filter_passed = !where_column_after_aggregation || where_column_after_aggregation->getBool(i); diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.h b/src/Processors/TTL/TTLAggregationAlgorithm.h index 0e4bf092ed6..9fd074efba8 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.h +++ b/src/Processors/TTL/TTLAggregationAlgorithm.h @@ -13,6 +13,7 @@ class TTLAggregationAlgorithm final : public ITTLAlgorithm { public: TTLAggregationAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLColumnAlgorithm.cpp b/src/Processors/TTL/TTLColumnAlgorithm.cpp index 04c4d7b9348..cb99dcf99b1 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.cpp +++ b/src/Processors/TTL/TTLColumnAlgorithm.cpp @@ -4,6 +4,7 @@ namespace DB { TTLColumnAlgorithm::TTLColumnAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, @@ -12,7 +13,7 @@ TTLColumnAlgorithm::TTLColumnAlgorithm( const ExpressionActionsPtr & default_expression_, const String & default_column_name_, bool is_compact_part_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) , column_name(column_name_) , default_expression(default_expression_) , default_column_name(default_column_name_) @@ -49,7 +50,7 @@ void TTLColumnAlgorithm::execute(Block & block) if (default_column) default_column = default_column->convertToFullColumnIfConst(); - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); auto & column_with_type = block.getByName(column_name); const IColumn * values_column = column_with_type.column.get(); diff --git a/src/Processors/TTL/TTLColumnAlgorithm.h b/src/Processors/TTL/TTLColumnAlgorithm.h index 30de77dcc2a..efcd7c74454 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.h +++ b/src/Processors/TTL/TTLColumnAlgorithm.h @@ -11,6 +11,7 @@ class TTLColumnAlgorithm final : public ITTLAlgorithm { public: TTLColumnAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.cpp b/src/Processors/TTL/TTLDeleteAlgorithm.cpp index f176df2d003..6a172e9c3c3 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.cpp +++ b/src/Processors/TTL/TTLDeleteAlgorithm.cpp @@ -4,8 +4,8 @@ namespace DB { TTLDeleteAlgorithm::TTLDeleteAlgorithm( - const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, 
bool force_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) { if (!isMinTTLExpired()) new_ttl_info = old_ttl_info; @@ -19,8 +19,8 @@ void TTLDeleteAlgorithm::execute(Block & block) if (!block || !isMinTTLExpired()) return; - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); - auto where_column = executeExpressionAndGetColumn(description.where_expression, block, description.where_result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); + auto where_column = executeExpressionAndGetColumn(ttl_expressions.where_expression, block, description.where_result_column); MutableColumns result_columns; const auto & column_names = block.getNames(); @@ -54,7 +54,7 @@ void TTLDeleteAlgorithm::execute(Block & block) void TTLDeleteAlgorithm::finalize(const MutableDataPartPtr & data_part) const { - if (description.where_expression) + if (ttl_expressions.where_expression) data_part->ttl_infos.rows_where_ttl[description.result_column] = new_ttl_info; else data_part->ttl_infos.table_ttl = new_ttl_info; diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.h b/src/Processors/TTL/TTLDeleteAlgorithm.h index 292a29bfa27..23389070774 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.h +++ b/src/Processors/TTL/TTLDeleteAlgorithm.h @@ -10,7 +10,7 @@ namespace DB class TTLDeleteAlgorithm final : public ITTLAlgorithm { public: - TTLDeleteAlgorithm(const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + TTLDeleteAlgorithm(const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); void execute(Block & block) override; void finalize(const MutableDataPartPtr & data_part) const override; diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp index eba364aa2b8..34c0cad70ea 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp @@ -4,13 +4,14 @@ namespace DB { TTLUpdateInfoAlgorithm::TTLUpdateInfoAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) - : ITTLAlgorithm(description_, old_ttl_info_, current_time_, force_) + : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) , ttl_update_field(ttl_update_field_) , ttl_update_key(ttl_update_key_) { @@ -21,7 +22,7 @@ void TTLUpdateInfoAlgorithm::execute(Block & block) if (!block) return; - auto ttl_column = executeExpressionAndGetColumn(description.expression, block, description.result_column); + auto ttl_column = executeExpressionAndGetColumn(ttl_expressions.expression, block, description.result_column); for (size_t i = 0; i < block.rows(); ++i) { UInt32 cur_ttl = ITTLAlgorithm::getTimestampByIndex(ttl_column.get(), i); diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h index 45eecbde3d0..e9bcfcdec88 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h @@ 
-20,6 +20,7 @@ class TTLUpdateInfoAlgorithm : public ITTLAlgorithm { public: TTLUpdateInfoAlgorithm( + const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, diff --git a/src/Processors/Transforms/TTLTransform.cpp b/src/Processors/Transforms/TTLTransform.cpp index 7cde86098c7..d3d45f68d46 100644 --- a/src/Processors/Transforms/TTLTransform.cpp +++ b/src/Processors/Transforms/TTLTransform.cpp @@ -36,7 +36,7 @@ TTLTransform::TTLTransform( rows_ttl, old_ttl_infos.table_ttl, current_time_, force_); /// Skip all data if table ttl is expired for part - if (algorithm->isMaxTTLExpired() && !rows_ttl.where_expression) + if (algorithm->isMaxTTLExpired() && !rows_ttl.where_expression_ast) all_data_dropped = true; delete_algorithm = algorithm.get(); diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 2a381afa805..d080240b066 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -125,13 +125,18 @@ void buildScatterSelector( /// Computes ttls and updates ttl infos void updateTTL( + const ContextPtr context, const TTLDescription & ttl_entry, IMergeTreeDataPart::TTLInfos & ttl_infos, DB::MergeTreeDataPartTTLInfo & ttl_info, const Block & block, bool update_part_min_max_ttls) { - auto ttl_column = ITTLAlgorithm::executeExpressionAndGetColumn(ttl_entry.expression, block, ttl_entry.result_column); + auto expr_and_set = ttl_entry.buildExpression(); + for (auto & subquery : expr_and_set.sets->getSubqueries()) + subquery->buildSetInplace(context); + + auto ttl_column = ITTLAlgorithm::executeExpressionAndGetColumn(expr_and_set.expression, block, ttl_entry.result_column); if (const ColumnUInt16 * column_date = typeid_cast(ttl_column.get())) { @@ -488,7 +493,7 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( DB::IMergeTreeDataPart::TTLInfos move_ttl_infos; const auto & move_ttl_entries = metadata_snapshot->getMoveTTLs(); for (const auto & ttl_entry : move_ttl_entries) - updateTTL(ttl_entry, move_ttl_infos, move_ttl_infos.moves_ttl[ttl_entry.result_column], block, false); + updateTTL(context, ttl_entry, move_ttl_infos, move_ttl_infos.moves_ttl[ttl_entry.result_column], block, false); ReservationPtr reservation = data.reserveSpacePreferringTTLRules(metadata_snapshot, expected_size, move_ttl_infos, time(nullptr), 0, true); VolumePtr volume = data.getStoragePolicy()->getVolume(0); @@ -543,20 +548,20 @@ MergeTreeDataWriter::TemporaryPart MergeTreeDataWriter::writeTempPartImpl( } if (metadata_snapshot->hasRowsTTL()) - updateTTL(metadata_snapshot->getRowsTTL(), new_data_part->ttl_infos, new_data_part->ttl_infos.table_ttl, block, true); + updateTTL(context, metadata_snapshot->getRowsTTL(), new_data_part->ttl_infos, new_data_part->ttl_infos.table_ttl, block, true); for (const auto & ttl_entry : metadata_snapshot->getGroupByTTLs()) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.group_by_ttl[ttl_entry.result_column], block, true); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.group_by_ttl[ttl_entry.result_column], block, true); for (const auto & ttl_entry : metadata_snapshot->getRowsWhereTTLs()) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.rows_where_ttl[ttl_entry.result_column], block, true); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, 
new_data_part->ttl_infos.rows_where_ttl[ttl_entry.result_column], block, true); for (const auto & [name, ttl_entry] : metadata_snapshot->getColumnTTLs()) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.columns_ttl[name], block, true); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.columns_ttl[name], block, true); const auto & recompression_ttl_entries = metadata_snapshot->getRecompressionTTLs(); for (const auto & ttl_entry : recompression_ttl_entries) - updateTTL(ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.recompression_ttl[ttl_entry.result_column], block, false); + updateTTL(context, ttl_entry, new_data_part->ttl_infos, new_data_part->ttl_infos.recompression_ttl[ttl_entry.result_column], block, false); new_data_part->ttl_infos.update(move_ttl_infos); diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index af285a953dc..7db5af82e0b 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -193,7 +193,7 @@ TTLDescription StorageInMemoryMetadata::getRowsTTL() const bool StorageInMemoryMetadata::hasRowsTTL() const { - return table_ttl.rows_ttl.expression != nullptr; + return table_ttl.rows_ttl.expression_ast != nullptr; } TTLDescriptions StorageInMemoryMetadata::getRowsWhereTTLs() const @@ -251,9 +251,8 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( NameSet required_ttl_columns; NameSet updated_ttl_columns; - auto add_dependent_columns = [&updated_columns](const auto & expression, auto & to_set) + auto add_dependent_columns = [&updated_columns](const Names & required_columns, auto & to_set) { - auto required_columns = expression->getRequiredColumns(); for (const auto & dependency : required_columns) { if (updated_columns.contains(dependency)) @@ -269,13 +268,13 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( for (const auto & index : getSecondaryIndices()) { if (has_dependency(index.name, ColumnDependency::SKIP_INDEX)) - add_dependent_columns(index.expression, indices_columns); + add_dependent_columns(index.expression->getRequiredColumns(), indices_columns); } for (const auto & projection : getProjections()) { if (has_dependency(projection.name, ColumnDependency::PROJECTION)) - add_dependent_columns(&projection, projections_columns); + add_dependent_columns(projection.getRequiredColumns(), projections_columns); } auto add_for_rows_ttl = [&](const auto & expression, auto & to_set) @@ -289,25 +288,25 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( }; if (hasRowsTTL()) - add_for_rows_ttl(getRowsTTL().expression, required_ttl_columns); + add_for_rows_ttl(getRowsTTL().expression_columns, required_ttl_columns); for (const auto & entry : getRowsWhereTTLs()) - add_for_rows_ttl(entry.expression, required_ttl_columns); + add_for_rows_ttl(entry.expression_columns, required_ttl_columns); for (const auto & entry : getGroupByTTLs()) - add_for_rows_ttl(entry.expression, required_ttl_columns); + add_for_rows_ttl(entry.expression_columns, required_ttl_columns); for (const auto & entry : getRecompressionTTLs()) - add_dependent_columns(entry.expression, required_ttl_columns); + add_dependent_columns(entry.expression_columns, required_ttl_columns); for (const auto & [name, entry] : getColumnTTLs()) { - if (add_dependent_columns(entry.expression, required_ttl_columns) && include_ttl_target) + if (add_dependent_columns(entry.expression_columns, required_ttl_columns) && 
include_ttl_target) updated_ttl_columns.insert(name); } for (const auto & entry : getMoveTTLs()) - add_dependent_columns(entry.expression, required_ttl_columns); + add_dependent_columns(entry.expression_columns, required_ttl_columns); //TODO what about rows_where_ttl and group_by_ttl ?? diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index f601fed06ac..47138f30e4f 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -113,11 +113,11 @@ TTLDescription::TTLDescription(const TTLDescription & other) , if_exists(other.if_exists) , recompression_codec(other.recompression_codec) { - if (other.expression) - expression = other.expression->clone(); + // if (other.expression) + // expression = other.expression->clone(); - if (other.where_expression) - where_expression = other.where_expression->clone(); + // if (other.where_expression) + // where_expression = other.where_expression->clone(); } TTLDescription & TTLDescription::operator=(const TTLDescription & other) @@ -131,16 +131,16 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) else expression_ast.reset(); - if (other.expression) - expression = other.expression->clone(); - else - expression.reset(); + // if (other.expression) + // expression = other.expression->clone(); + // else + // expression.reset(); result_column = other.result_column; - if (other.where_expression) - where_expression = other.where_expression->clone(); - else - where_expression.reset(); + // if (other.where_expression) + // where_expression = other.where_expression->clone(); + // else + // where_expression.reset(); where_result_column = other.where_result_column; group_by_keys = other.group_by_keys; @@ -158,6 +158,17 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) return * this; } +static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndTypesList & columns, const ContextPtr & context) +{ + ExpressionAndSets result; + auto syntax_analyzer_result = TreeRewriter(context).analyze(ast, columns); + ExpressionAnalyzer analyzer(ast, syntax_analyzer_result, context); + result.expression = analyzer.getActions(false); + result.sets = analyzer.getPreparedSets(); + + return result; +} + TTLDescription TTLDescription::getTTLFromAST( const ASTPtr & definition_ast, const ColumnsDescription & columns, @@ -174,10 +185,15 @@ TTLDescription TTLDescription::getTTLFromAST( result.expression_ast = definition_ast->clone(); auto ttl_ast = result.expression_ast->clone(); - auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); - result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); + auto expression = buildExpressionAndSets(ttl_ast, columns.getAllPhysical(), context).expression; + result.expression_columns = expression->getRequiredColumns(); + + // auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); + // result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); result.result_column = ttl_ast->getColumnName(); + ExpressionActionsPtr where_expression; + if (ttl_element == nullptr) /// columns TTL { result.destination_type = DataDestinationType::DELETE; @@ -194,8 +210,10 @@ TTLDescription TTLDescription::getTTLFromAST( { if (ASTPtr where_expr_ast = ttl_element->where()) { - auto where_syntax_result = TreeRewriter(context).analyze(where_expr_ast, columns.getAllPhysical()); - result.where_expression = 
ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); + result.where_expression_ast = where_expr_ast->clone(); + where_expression = buildExpressionAndSets(where_expr_ast, columns.getAllPhysical(), context).expression; + // auto where_syntax_result = TreeRewriter(context).analyze(where_expr_ast, columns.getAllPhysical()); + // result.where_expression = ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); result.where_result_column = where_expr_ast->getColumnName(); } } @@ -221,17 +239,17 @@ TTLDescription TTLDescription::getTTLFromAST( for (const auto & ast : ttl_element->group_by_assignments) { const auto assignment = ast->as(); - auto expression = assignment.expression(); + auto ass_expression = assignment.expression(); FindAggregateFunctionVisitor::Data data{false}; - FindAggregateFunctionVisitor(data).visit(expression); + FindAggregateFunctionVisitor(data).visit(ass_expression); if (!data.has_aggregate_function) throw Exception(ErrorCodes::BAD_TTL_EXPRESSION, "Invalid expression for assignment of column {}. Should contain an aggregate function", assignment.column_name); - expression = addTypeConversionToAST(std::move(expression), columns.getPhysical(assignment.column_name).type->getName()); - aggregations.emplace_back(assignment.column_name, std::move(expression)); + ass_expression = addTypeConversionToAST(std::move(ass_expression), columns.getPhysical(assignment.column_name).type->getName()); + aggregations.emplace_back(assignment.column_name, std::move(ass_expression)); aggregation_columns_set.insert(assignment.column_name); } @@ -289,7 +307,7 @@ TTLDescription TTLDescription::getTTLFromAST( } } - checkTTLExpression(result.expression, result.result_column); + checkTTLExpression(expression, result.result_column); return result; } @@ -341,7 +359,7 @@ TTLTableDescription TTLTableDescription::getTTLForTableFromAST( auto ttl = TTLDescription::getTTLFromAST(ttl_element_ptr, columns, context, primary_key); if (ttl.mode == TTLMode::DELETE) { - if (!ttl.where_expression) + if (!ttl.where_expression_ast) { if (have_unconditional_delete_ttl) throw Exception(ErrorCodes::BAD_TTL_EXPRESSION, "More than one DELETE TTL expression without WHERE expression is not allowed"); diff --git a/src/Storages/TTLDescription.h b/src/Storages/TTLDescription.h index 8f60eb604b5..5ea243424cb 100644 --- a/src/Storages/TTLDescription.h +++ b/src/Storages/TTLDescription.h @@ -33,6 +33,15 @@ struct TTLAggregateDescription using TTLAggregateDescriptions = std::vector; +class PreparedSets; +using PreparedSetsPtr = std::shared_ptr; + +struct ExpressionAndSets +{ + ExpressionActionsPtr expression; + PreparedSetsPtr sets; +}; + /// Common struct for TTL record in storage struct TTLDescription { @@ -42,9 +51,10 @@ struct TTLDescription /// TTL d + INTERVAL 1 DAY /// ^~~~~~~~~~~~~~~~~~~^ ASTPtr expression_ast; + Names expression_columns; /// Expression actions evaluated from AST - ExpressionActionsPtr expression; + ExpressionAndSets buildExpression() const; /// Result column of this TTL expression String result_column; @@ -52,7 +62,8 @@ struct TTLDescription /// WHERE part in TTL expression /// TTL ... 
WHERE x % 10 == 0 and y > 5 /// ^~~~~~~~~~~~~~~~~~~~~~^ - ExpressionActionsPtr where_expression; + ASTPtr where_expression_ast; + ExpressionAndSets buildWhereExpression() const; /// Name of result column from WHERE expression String where_result_column; From 7ab4af06df0d78e6728e3cc5c727e5c9e4cc33ef Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 4 Dec 2023 18:04:42 +0000 Subject: [PATCH 021/884] Attempt to support subqueries in TTL. (2) --- src/Processors/QueryPlan/CreatingSetsStep.cpp | 29 +++++++++++ src/Processors/QueryPlan/CreatingSetsStep.h | 2 + src/Processors/TTL/ITTLAlgorithm.cpp | 2 +- src/Processors/TTL/ITTLAlgorithm.h | 6 +-- .../TTL/TTLAggregationAlgorithm.cpp | 2 +- src/Processors/TTL/TTLAggregationAlgorithm.h | 2 +- src/Processors/TTL/TTLColumnAlgorithm.cpp | 2 +- src/Processors/TTL/TTLColumnAlgorithm.h | 2 +- src/Processors/TTL/TTLDeleteAlgorithm.cpp | 2 +- src/Processors/TTL/TTLDeleteAlgorithm.h | 2 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp | 2 +- src/Processors/TTL/TTLUpdateInfoAlgorithm.h | 2 +- .../Transforms/TTLCalcTransform.cpp | 33 ++++++++++--- src/Processors/Transforms/TTLCalcTransform.h | 4 ++ src/Processors/Transforms/TTLTransform.cpp | 33 ++++++++++--- src/Processors/Transforms/TTLTransform.h | 5 ++ src/Storages/MergeTree/MergeTask.cpp | 36 +++++++++----- .../MergeTree/MergeTreeDataWriter.cpp | 2 +- src/Storages/MergeTree/MutateTask.cpp | 49 ++++++++++++++----- src/Storages/StorageInMemoryMetadata.cpp | 8 +-- src/Storages/TTLDescription.cpp | 21 +++++++- src/Storages/TTLDescription.h | 7 +-- 22 files changed, 197 insertions(+), 56 deletions(-) diff --git a/src/Processors/QueryPlan/CreatingSetsStep.cpp b/src/Processors/QueryPlan/CreatingSetsStep.cpp index 3e4dfb0c7d1..11415e8d815 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.cpp +++ b/src/Processors/QueryPlan/CreatingSetsStep.cpp @@ -157,6 +157,35 @@ void addCreatingSetsStep(QueryPlan & query_plan, PreparedSets::Subqueries subque query_plan.unitePlans(std::move(creating_sets), std::move(plans)); } +QueryPipelineBuilderPtr addCreatingSetsTransform(QueryPipelineBuilderPtr pipeline, PreparedSets::Subqueries subqueries, ContextPtr context) +{ + DataStreams input_streams; + input_streams.emplace_back(DataStream{pipeline->getHeader()}); + + QueryPipelineBuilders pipelines; + pipelines.reserve(1 + subqueries.size()); + pipelines.push_back(std::move(pipeline)); + + auto plan_settings = QueryPlanOptimizationSettings::fromContext(context); + auto pipeline_settings = BuildQueryPipelineSettings::fromContext(context); + + for (auto & future_set : subqueries) + { + if (future_set->get()) + continue; + + auto plan = future_set->build(context); + if (!plan) + continue; + + input_streams.emplace_back(plan->getCurrentDataStream()); + pipelines.emplace_back(plan->buildQueryPipeline(plan_settings, pipeline_settings)); + } + + CreatingSetsStep(input_streams).updatePipeline(std::move(pipelines), pipeline_settings); + return std::move(pipelines.front()); +} + std::vector> DelayedCreatingSetsStep::makePlansForSets(DelayedCreatingSetsStep && step) { std::vector> plans; diff --git a/src/Processors/QueryPlan/CreatingSetsStep.h b/src/Processors/QueryPlan/CreatingSetsStep.h index a90b70a2fa4..292ec19914c 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.h +++ b/src/Processors/QueryPlan/CreatingSetsStep.h @@ -72,4 +72,6 @@ void addCreatingSetsStep(QueryPlan & query_plan, PreparedSets::Subqueries subque void addCreatingSetsStep(QueryPlan & query_plan, PreparedSetsPtr prepared_sets, ContextPtr context); 
+QueryPipelineBuilderPtr addCreatingSetsTransform(QueryPipelineBuilderPtr pipeline, PreparedSets::Subqueries subqueries, ContextPtr context); + } diff --git a/src/Processors/TTL/ITTLAlgorithm.cpp b/src/Processors/TTL/ITTLAlgorithm.cpp index af6c4e4ac35..761f43e2422 100644 --- a/src/Processors/TTL/ITTLAlgorithm.cpp +++ b/src/Processors/TTL/ITTLAlgorithm.cpp @@ -11,7 +11,7 @@ namespace ErrorCodes } ITTLAlgorithm::ITTLAlgorithm( - const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) : ttl_expressions(ttl_expressions_) , description(description_) , old_ttl_info(old_ttl_info_) diff --git a/src/Processors/TTL/ITTLAlgorithm.h b/src/Processors/TTL/ITTLAlgorithm.h index 6e73286b564..d79aa8a8dfc 100644 --- a/src/Processors/TTL/ITTLAlgorithm.h +++ b/src/Processors/TTL/ITTLAlgorithm.h @@ -8,7 +8,7 @@ namespace DB { -struct TTlExpressions +struct TTLExpressions { ExpressionActionsPtr expression; ExpressionActionsPtr where_expression; @@ -24,7 +24,7 @@ public: using TTLInfo = IMergeTreeDataPart::TTLInfo; using MutableDataPartPtr = MergeTreeMutableDataPartPtr; - ITTLAlgorithm(const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + ITTLAlgorithm(const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); virtual ~ITTLAlgorithm() = default; virtual void execute(Block & block) = 0; @@ -45,7 +45,7 @@ protected: bool isTTLExpired(time_t ttl) const; UInt32 getTimestampByIndex(const IColumn * column, size_t index) const; - const TTlExpressions ttl_expressions; + const TTLExpressions ttl_expressions; const TTLDescription description; const TTLInfo old_ttl_info; const time_t current_time; diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.cpp b/src/Processors/TTL/TTLAggregationAlgorithm.cpp index ab2ba5f58fc..0c6184a56e5 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.cpp +++ b/src/Processors/TTL/TTLAggregationAlgorithm.cpp @@ -5,7 +5,7 @@ namespace DB { TTLAggregationAlgorithm::TTLAggregationAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLAggregationAlgorithm.h b/src/Processors/TTL/TTLAggregationAlgorithm.h index 9fd074efba8..f7bf19a202b 100644 --- a/src/Processors/TTL/TTLAggregationAlgorithm.h +++ b/src/Processors/TTL/TTLAggregationAlgorithm.h @@ -13,7 +13,7 @@ class TTLAggregationAlgorithm final : public ITTLAlgorithm { public: TTLAggregationAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLColumnAlgorithm.cpp b/src/Processors/TTL/TTLColumnAlgorithm.cpp index cb99dcf99b1..e27050564ce 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.cpp +++ b/src/Processors/TTL/TTLColumnAlgorithm.cpp @@ -4,7 +4,7 @@ namespace DB { TTLColumnAlgorithm::TTLColumnAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git 
a/src/Processors/TTL/TTLColumnAlgorithm.h b/src/Processors/TTL/TTLColumnAlgorithm.h index efcd7c74454..f34dae952d1 100644 --- a/src/Processors/TTL/TTLColumnAlgorithm.h +++ b/src/Processors/TTL/TTLColumnAlgorithm.h @@ -11,7 +11,7 @@ class TTLColumnAlgorithm final : public ITTLAlgorithm { public: TTLColumnAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.cpp b/src/Processors/TTL/TTLDeleteAlgorithm.cpp index 6a172e9c3c3..6f9bc315276 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.cpp +++ b/src/Processors/TTL/TTLDeleteAlgorithm.cpp @@ -4,7 +4,7 @@ namespace DB { TTLDeleteAlgorithm::TTLDeleteAlgorithm( - const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_) : ITTLAlgorithm(ttl_expressions_, description_, old_ttl_info_, current_time_, force_) { if (!isMinTTLExpired()) diff --git a/src/Processors/TTL/TTLDeleteAlgorithm.h b/src/Processors/TTL/TTLDeleteAlgorithm.h index 23389070774..622e45acecb 100644 --- a/src/Processors/TTL/TTLDeleteAlgorithm.h +++ b/src/Processors/TTL/TTLDeleteAlgorithm.h @@ -10,7 +10,7 @@ namespace DB class TTLDeleteAlgorithm final : public ITTLAlgorithm { public: - TTLDeleteAlgorithm(const TTlExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); + TTLDeleteAlgorithm(const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLInfo & old_ttl_info_, time_t current_time_, bool force_); void execute(Block & block) override; void finalize(const MutableDataPartPtr & data_part) const override; diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp index 34c0cad70ea..b7cddf3c165 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.cpp @@ -4,7 +4,7 @@ namespace DB { TTLUpdateInfoAlgorithm::TTLUpdateInfoAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, diff --git a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h index e9bcfcdec88..0cf31765aef 100644 --- a/src/Processors/TTL/TTLUpdateInfoAlgorithm.h +++ b/src/Processors/TTL/TTLUpdateInfoAlgorithm.h @@ -20,7 +20,7 @@ class TTLUpdateInfoAlgorithm : public ITTLAlgorithm { public: TTLUpdateInfoAlgorithm( - const TTlExpressions & ttl_expressions_, + const TTLExpressions & ttl_expressions_, const TTLDescription & description_, const TTLUpdateField ttl_update_field_, const String ttl_update_key_, diff --git a/src/Processors/Transforms/TTLCalcTransform.cpp b/src/Processors/Transforms/TTLCalcTransform.cpp index 31fb61239ef..204dfe21733 100644 --- a/src/Processors/Transforms/TTLCalcTransform.cpp +++ b/src/Processors/Transforms/TTLCalcTransform.cpp @@ -4,7 +4,22 @@ namespace DB { +static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) +{ + auto expr = ttl_descr.buildExpression(context); + auto where_expr = 
ttl_descr.buildWhereExpression(context); + + auto expr_queries = expr.sets->getSubqueries(); + auto where_expr_queries = expr.sets->getSubqueries(); + + subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + + return {expr.expression, where_expr.expression}; +} + TTLCalcTransform::TTLCalcTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -21,33 +36,39 @@ TTLCalcTransform::TTLCalcTransform( { const auto & rows_ttl = metadata_snapshot_->getRowsTTL(); algorithms.emplace_back(std::make_unique( - rows_ttl, TTLUpdateField::TABLE_TTL, rows_ttl.result_column, old_ttl_infos.table_ttl, current_time_, force_)); + getExpressions(rows_ttl, subqueries_for_sets, context), rows_ttl, + TTLUpdateField::TABLE_TTL, rows_ttl.result_column, old_ttl_infos.table_ttl, current_time_, force_)); } for (const auto & where_ttl : metadata_snapshot_->getRowsWhereTTLs()) algorithms.emplace_back(std::make_unique( - where_ttl, TTLUpdateField::ROWS_WHERE_TTL, where_ttl.result_column, old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); + getExpressions(where_ttl, subqueries_for_sets, context), where_ttl, + TTLUpdateField::ROWS_WHERE_TTL, where_ttl.result_column, old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); for (const auto & group_by_ttl : metadata_snapshot_->getGroupByTTLs()) algorithms.emplace_back(std::make_unique( - group_by_ttl, TTLUpdateField::GROUP_BY_TTL, group_by_ttl.result_column, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_)); + getExpressions(group_by_ttl, subqueries_for_sets, context), group_by_ttl, + TTLUpdateField::GROUP_BY_TTL, group_by_ttl.result_column, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_)); if (metadata_snapshot_->hasAnyColumnTTL()) { for (const auto & [name, description] : metadata_snapshot_->getColumnTTLs()) { algorithms.emplace_back(std::make_unique( - description, TTLUpdateField::COLUMNS_TTL, name, old_ttl_infos.columns_ttl[name], current_time_, force_)); + getExpressions(description, subqueries_for_sets, context), description, + TTLUpdateField::COLUMNS_TTL, name, old_ttl_infos.columns_ttl[name], current_time_, force_)); } } for (const auto & move_ttl : metadata_snapshot_->getMoveTTLs()) algorithms.emplace_back(std::make_unique( - move_ttl, TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); + getExpressions(move_ttl, subqueries_for_sets, context), move_ttl, + TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); for (const auto & recompression_ttl : metadata_snapshot_->getRecompressionTTLs()) algorithms.emplace_back(std::make_unique( - recompression_ttl, TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); + getExpressions(recompression_ttl, subqueries_for_sets, context), recompression_ttl, + TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); } void TTLCalcTransform::consume(Chunk chunk) diff --git a/src/Processors/Transforms/TTLCalcTransform.h 
b/src/Processors/Transforms/TTLCalcTransform.h index 495879400dc..960438f5f2b 100644 --- a/src/Processors/Transforms/TTLCalcTransform.h +++ b/src/Processors/Transforms/TTLCalcTransform.h @@ -15,6 +15,7 @@ class TTLCalcTransform : public IAccumulatingTransform { public: TTLCalcTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -23,6 +24,8 @@ public: bool force_ ); + PreparedSets::Subqueries getSubqueries() { return std::move(subqueries_for_sets); } + String getName() const override { return "TTL_CALC"; } Status prepare() override; @@ -35,6 +38,7 @@ protected: private: std::vector algorithms; + PreparedSets::Subqueries subqueries_for_sets; /// ttl_infos and empty_columns are updating while reading const MergeTreeData::MutableDataPartPtr & data_part; diff --git a/src/Processors/Transforms/TTLTransform.cpp b/src/Processors/Transforms/TTLTransform.cpp index d3d45f68d46..69e2e6e5fc0 100644 --- a/src/Processors/Transforms/TTLTransform.cpp +++ b/src/Processors/Transforms/TTLTransform.cpp @@ -16,7 +16,22 @@ namespace DB { +static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) +{ + auto expr = ttl_descr.buildExpression(context); + auto where_expr = ttl_descr.buildWhereExpression(context); + + auto expr_queries = expr.sets->getSubqueries(); + auto where_expr_queries = expr.sets->getSubqueries(); + + subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + + return {expr.expression, where_expr.expression}; +} + TTLTransform::TTLTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -33,7 +48,8 @@ TTLTransform::TTLTransform( { const auto & rows_ttl = metadata_snapshot_->getRowsTTL(); auto algorithm = std::make_unique( - rows_ttl, old_ttl_infos.table_ttl, current_time_, force_); + getExpressions(rows_ttl, subqueries_for_sets, context), rows_ttl, + old_ttl_infos.table_ttl, current_time_, force_); /// Skip all data if table ttl is expired for part if (algorithm->isMaxTTLExpired() && !rows_ttl.where_expression_ast) @@ -45,11 +61,13 @@ TTLTransform::TTLTransform( for (const auto & where_ttl : metadata_snapshot_->getRowsWhereTTLs()) algorithms.emplace_back(std::make_unique( - where_ttl, old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); + getExpressions(where_ttl, subqueries_for_sets, context), where_ttl, + old_ttl_infos.rows_where_ttl[where_ttl.result_column], current_time_, force_)); for (const auto & group_by_ttl : metadata_snapshot_->getGroupByTTLs()) algorithms.emplace_back(std::make_unique( - group_by_ttl, old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_, + getExpressions(group_by_ttl, subqueries_for_sets, context), group_by_ttl, + old_ttl_infos.group_by_ttl[group_by_ttl.result_column], current_time_, force_, getInputPort().getHeader(), storage_)); if (metadata_snapshot_->hasAnyColumnTTL()) @@ -75,18 +93,21 @@ TTLTransform::TTLTransform( } algorithms.emplace_back(std::make_unique( - description, old_ttl_infos.columns_ttl[name], current_time_, + getExpressions(description, subqueries_for_sets, context), description, + old_ttl_infos.columns_ttl[name], current_time_, force_, name, default_expression, default_column_name, 
isCompactPart(data_part))); } } for (const auto & move_ttl : metadata_snapshot_->getMoveTTLs()) algorithms.emplace_back(std::make_unique( - move_ttl, TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); + getExpressions(move_ttl, subqueries_for_sets, context), move_ttl, + TTLUpdateField::MOVES_TTL, move_ttl.result_column, old_ttl_infos.moves_ttl[move_ttl.result_column], current_time_, force_)); for (const auto & recompression_ttl : metadata_snapshot_->getRecompressionTTLs()) algorithms.emplace_back(std::make_unique( - recompression_ttl, TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); + getExpressions(recompression_ttl, subqueries_for_sets, context), recompression_ttl, + TTLUpdateField::RECOMPRESSION_TTL, recompression_ttl.result_column, old_ttl_infos.recompression_ttl[recompression_ttl.result_column], current_time_, force_)); } Block reorderColumns(Block block, const Block & header) diff --git a/src/Processors/Transforms/TTLTransform.h b/src/Processors/Transforms/TTLTransform.h index 3f0dffd1998..47da456a2e3 100644 --- a/src/Processors/Transforms/TTLTransform.h +++ b/src/Processors/Transforms/TTLTransform.h @@ -16,6 +16,7 @@ class TTLTransform : public IAccumulatingTransform { public: TTLTransform( + const ContextPtr & context, const Block & header_, const MergeTreeData & storage_, const StorageMetadataPtr & metadata_snapshot_, @@ -28,6 +29,8 @@ public: Status prepare() override; + PreparedSets::Subqueries getSubqueries() { return std::move(subqueries_for_sets); } + protected: void consume(Chunk chunk) override; Chunk generate() override; @@ -40,6 +43,8 @@ private: const TTLDeleteAlgorithm * delete_algorithm = nullptr; bool all_data_dropped = false; + PreparedSets::Subqueries subqueries_for_sets; + /// ttl_infos and empty_columns are updating while reading const MergeTreeData::MutableDataPartPtr & data_part; Poco::Logger * log; diff --git a/src/Storages/MergeTree/MergeTask.cpp b/src/Storages/MergeTree/MergeTask.cpp index e8e307bb148..26b290d33d5 100644 --- a/src/Storages/MergeTree/MergeTask.cpp +++ b/src/Storages/MergeTree/MergeTask.cpp @@ -31,6 +31,9 @@ #include #include #include +#include +#include +#include namespace DB { @@ -1004,8 +1007,9 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() break; } - auto res_pipe = Pipe::unitePipes(std::move(pipes)); - res_pipe.addTransform(std::move(merged_transform)); + auto builder = std::make_unique(); + builder->init(Pipe::unitePipes(std::move(pipes))); + builder->addTransform(std::move(merged_transform)); if (global_ctx->deduplicate) { @@ -1021,26 +1025,34 @@ void MergeTask::ExecuteAndFinalizeHorizontalPart::createMergedStream() } if (DistinctSortedTransform::isApplicable(header, sort_description, global_ctx->deduplicate_by_columns)) - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); + builder->addTransform(std::make_shared( + builder->getHeader(), sort_description, SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); else - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); + builder->addTransform(std::make_shared( + builder->getHeader(), SizeLimits(), 0 /*limit_hint*/, global_ctx->deduplicate_by_columns)); } + PreparedSets::Subqueries 
subqueries; + if (ctx->need_remove_expired_values) - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), *global_ctx->data, global_ctx->metadata_snapshot, global_ctx->new_data_part, global_ctx->time_of_merge, ctx->force_ttl)); + { + auto transform = std::make_shared(global_ctx->context, builder->getHeader(), *global_ctx->data, global_ctx->metadata_snapshot, global_ctx->new_data_part, global_ctx->time_of_merge, ctx->force_ttl); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } if (global_ctx->metadata_snapshot->hasSecondaryIndices()) { const auto & indices = global_ctx->metadata_snapshot->getSecondaryIndices(); - res_pipe.addTransform(std::make_shared( - res_pipe.getHeader(), indices.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext()))); - res_pipe.addTransform(std::make_shared(res_pipe.getHeader())); + builder->addTransform(std::make_shared( + builder->getHeader(), indices.getSingleExpressionForIndices(global_ctx->metadata_snapshot->getColumns(), global_ctx->data->getContext()))); + builder->addTransform(std::make_shared(builder->getHeader())); } - global_ctx->merged_pipeline = QueryPipeline(std::move(res_pipe)); + if (!subqueries.empty()) + builder = addCreatingSetsTransform(std::move(builder), std::move(subqueries), global_ctx->context); + + global_ctx->merged_pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); /// Dereference unique_ptr and pass horizontal_stage_progress by reference global_ctx->merged_pipeline.setProgressCallback(MergeProgressCallback(global_ctx->merge_list_element_ptr, global_ctx->watch_prev_elapsed, *global_ctx->horizontal_stage_progress)); /// Is calculated inside MergeProgressCallback. diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index d080240b066..ce9e5762cb4 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -132,7 +132,7 @@ void updateTTL( const Block & block, bool update_part_min_max_ttls) { - auto expr_and_set = ttl_entry.buildExpression(); + auto expr_and_set = ttl_entry.buildExpression(context); for (auto & subquery : expr_and_set.sets->getSubqueries()) subquery->buildSetInplace(context); diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 6b6b5947581..61849f94e44 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -16,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -1507,21 +1509,34 @@ private: if (!ctx->mutating_pipeline_builder.initialized()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot mutate part columns with uninitialized mutations stream. 
It's a bug"); - QueryPipelineBuilder builder(std::move(ctx->mutating_pipeline_builder)); + auto builder = std::make_unique(std::move(ctx->mutating_pipeline_builder)); if (ctx->metadata_snapshot->hasPrimaryKey() || ctx->metadata_snapshot->hasSecondaryIndices()) { - builder.addTransform(std::make_shared( - builder.getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot, skip_indices))); + builder->addTransform(std::make_shared( + builder->getHeader(), ctx->data->getPrimaryKeyAndSkipIndicesExpression(ctx->metadata_snapshot, skip_indices))); - builder.addTransform(std::make_shared(builder.getHeader())); + builder->addTransform(std::make_shared(builder->getHeader())); } + PreparedSets::Subqueries subqueries; + if (ctx->execute_ttl_type == ExecuteTTLType::NORMAL) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } if (ctx->execute_ttl_type == ExecuteTTLType::RECALCULATE) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } + + if (!subqueries.empty()) + builder = addCreatingSetsTransform(std::move(builder), std::move(subqueries), ctx->context); ctx->minmax_idx = std::make_shared(); @@ -1537,7 +1552,7 @@ private: /*blocks_are_granules_size=*/ false, ctx->context->getWriteSettings()); - ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); + ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); ctx->mutating_pipeline.setProgressCallback(ctx->progress_callback); /// Is calculated inside MergeProgressCallback. 
ctx->mutating_pipeline.disableProfileEventUpdate(); @@ -1712,13 +1727,25 @@ private: if (ctx->mutating_pipeline_builder.initialized()) { - QueryPipelineBuilder builder(std::move(ctx->mutating_pipeline_builder)); + auto builder = std::make_unique(std::move(ctx->mutating_pipeline_builder)); + PreparedSets::Subqueries subqueries; if (ctx->execute_ttl_type == ExecuteTTLType::NORMAL) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } if (ctx->execute_ttl_type == ExecuteTTLType::RECALCULATE) - builder.addTransform(std::make_shared(builder.getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true)); + { + auto transform = std::make_shared(ctx->context, builder->getHeader(), *ctx->data, ctx->metadata_snapshot, ctx->new_data_part, ctx->time_of_mutation, true); + subqueries = transform->getSubqueries(); + builder->addTransform(std::move(transform)); + } + + if (!subqueries.empty()) + builder = addCreatingSetsTransform(std::move(builder), std::move(subqueries), ctx->context); ctx->out = std::make_shared( ctx->new_data_part, @@ -1732,7 +1759,7 @@ private: &ctx->source_part->index_granularity_info ); - ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(builder)); + ctx->mutating_pipeline = QueryPipelineBuilder::getPipeline(std::move(*builder)); ctx->mutating_pipeline.setProgressCallback(ctx->progress_callback); /// Is calculated inside MergeProgressCallback. ctx->mutating_pipeline.disableProfileEventUpdate(); diff --git a/src/Storages/StorageInMemoryMetadata.cpp b/src/Storages/StorageInMemoryMetadata.cpp index 7db5af82e0b..158c13b653d 100644 --- a/src/Storages/StorageInMemoryMetadata.cpp +++ b/src/Storages/StorageInMemoryMetadata.cpp @@ -279,7 +279,7 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( auto add_for_rows_ttl = [&](const auto & expression, auto & to_set) { - if (add_dependent_columns(expression, to_set) && include_ttl_target) + if (add_dependent_columns(expression.getNames(), to_set) && include_ttl_target) { /// Filter all columns, if rows TTL expression have to be recalculated. for (const auto & column : getColumns().getAllPhysical()) @@ -297,16 +297,16 @@ ColumnDependencies StorageInMemoryMetadata::getColumnDependencies( add_for_rows_ttl(entry.expression_columns, required_ttl_columns); for (const auto & entry : getRecompressionTTLs()) - add_dependent_columns(entry.expression_columns, required_ttl_columns); + add_dependent_columns(entry.expression_columns.getNames(), required_ttl_columns); for (const auto & [name, entry] : getColumnTTLs()) { - if (add_dependent_columns(entry.expression_columns, required_ttl_columns) && include_ttl_target) + if (add_dependent_columns(entry.expression_columns.getNames(), required_ttl_columns) && include_ttl_target) updated_ttl_columns.insert(name); } for (const auto & entry : getMoveTTLs()) - add_dependent_columns(entry.expression_columns, required_ttl_columns); + add_dependent_columns(entry.expression_columns.getNames(), required_ttl_columns); //TODO what about rows_where_ttl and group_by_ttl ?? 
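For orientation, the TTL changes in this patch target TTL expressions whose WHERE clause contains an IN-subquery: the subqueries collected into PreparedSets above have to be built (via addCreatingSetsTransform) before the TTL transforms can evaluate the expression. A minimal sketch of a table of that shape, modeled on the 02932_set_ttl_where test added later in this series (the table and column names here are purely illustrative, not part of the patch), is:

```sql
-- Illustrative only: a TTL whose WHERE clause needs a prepared set built from a subquery.
CREATE TABLE t_ttl_where_example
(
    a UInt32,
    timestamp DateTime
)
ENGINE = MergeTree
ORDER BY a
TTL timestamp + INTERVAL 2 SECOND WHERE a IN (SELECT number FROM system.numbers LIMIT 100_000);
```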
diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index 47138f30e4f..e02ac933028 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -169,6 +169,23 @@ static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndType return result; } +ExpressionAndSets TTLDescription::buildExpression(const ContextPtr & context) const +{ + auto ast = expression_ast->clone(); + return buildExpressionAndSets(ast, expression_columns, context); +} + +ExpressionAndSets TTLDescription::buildWhereExpression(const ContextPtr & context) const +{ + if (where_expression_ast) + { + auto ast = where_expression_ast->clone(); + return buildExpressionAndSets(ast, where_expression_columns, context); + } + + return {}; +} + TTLDescription TTLDescription::getTTLFromAST( const ASTPtr & definition_ast, const ColumnsDescription & columns, @@ -186,7 +203,7 @@ TTLDescription TTLDescription::getTTLFromAST( auto ttl_ast = result.expression_ast->clone(); auto expression = buildExpressionAndSets(ttl_ast, columns.getAllPhysical(), context).expression; - result.expression_columns = expression->getRequiredColumns(); + result.expression_columns = expression->getRequiredColumnsWithTypes(); // auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); // result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); @@ -214,6 +231,8 @@ TTLDescription TTLDescription::getTTLFromAST( where_expression = buildExpressionAndSets(where_expr_ast, columns.getAllPhysical(), context).expression; // auto where_syntax_result = TreeRewriter(context).analyze(where_expr_ast, columns.getAllPhysical()); // result.where_expression = ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); + + result.where_expression_columns = where_expression->getRequiredColumnsWithTypes(); result.where_result_column = where_expr_ast->getColumnName(); } } diff --git a/src/Storages/TTLDescription.h b/src/Storages/TTLDescription.h index 5ea243424cb..7dfc736ded2 100644 --- a/src/Storages/TTLDescription.h +++ b/src/Storages/TTLDescription.h @@ -51,10 +51,10 @@ struct TTLDescription /// TTL d + INTERVAL 1 DAY /// ^~~~~~~~~~~~~~~~~~~^ ASTPtr expression_ast; - Names expression_columns; + NamesAndTypesList expression_columns; /// Expression actions evaluated from AST - ExpressionAndSets buildExpression() const; + ExpressionAndSets buildExpression(const ContextPtr & context) const; /// Result column of this TTL expression String result_column; @@ -63,7 +63,8 @@ struct TTLDescription /// TTL ... 
WHERE x % 10 == 0 and y > 5 /// ^~~~~~~~~~~~~~~~~~~~~~^ ASTPtr where_expression_ast; - ExpressionAndSets buildWhereExpression() const; + NamesAndTypesList where_expression_columns; + ExpressionAndSets buildWhereExpression(const ContextPtr & context) const; /// Name of result column from WHERE expression String where_result_column; From 16558ccc840d7a15efb2ab0fe691a79c38dd5086 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 4 Dec 2023 18:13:34 +0000 Subject: [PATCH 022/884] Fix some tests --- src/Storages/TTLDescription.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index e02ac933028..e32ff11860b 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -103,7 +103,10 @@ using FindAggregateFunctionVisitor = InDepthNodeVisitorclone() : nullptr) + , expression_columns(other.expression_columns) , result_column(other.result_column) + , where_expression_ast(other.where_expression_ast ? other.where_expression_ast->clone() : nullptr) + , where_expression_columns(other.where_expression_columns) , where_result_column(other.where_result_column) , group_by_keys(other.group_by_keys) , set_parts(other.set_parts) @@ -136,12 +139,20 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) // else // expression.reset(); + expression_columns = other.expression_columns; result_column = other.result_column; + + if (other.where_expression_ast) + where_expression_ast = other.where_expression_ast->clone(); + else + where_expression_ast.reset(); + // if (other.where_expression) // where_expression = other.where_expression->clone(); // else // where_expression.reset(); + where_expression_columns = other.where_expression_columns; where_result_column = other.where_result_column; group_by_keys = other.group_by_keys; set_parts = other.set_parts; From 6a821f9e737373b28bc98f25e10439dd04e7bdb8 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Mon, 4 Dec 2023 19:24:27 +0000 Subject: [PATCH 023/884] Fix some staff --- src/Processors/QueryPlan/CreatingSetsStep.cpp | 3 +-- src/Processors/Transforms/TTLCalcTransform.cpp | 12 +++++++----- src/Processors/Transforms/TTLTransform.cpp | 12 +++++++----- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/Processors/QueryPlan/CreatingSetsStep.cpp b/src/Processors/QueryPlan/CreatingSetsStep.cpp index 11415e8d815..f13a717004f 100644 --- a/src/Processors/QueryPlan/CreatingSetsStep.cpp +++ b/src/Processors/QueryPlan/CreatingSetsStep.cpp @@ -182,8 +182,7 @@ QueryPipelineBuilderPtr addCreatingSetsTransform(QueryPipelineBuilderPtr pipelin pipelines.emplace_back(plan->buildQueryPipeline(plan_settings, pipeline_settings)); } - CreatingSetsStep(input_streams).updatePipeline(std::move(pipelines), pipeline_settings); - return std::move(pipelines.front()); + return CreatingSetsStep(input_streams).updatePipeline(std::move(pipelines), pipeline_settings); } std::vector> DelayedCreatingSetsStep::makePlansForSets(DelayedCreatingSetsStep && step) diff --git a/src/Processors/Transforms/TTLCalcTransform.cpp b/src/Processors/Transforms/TTLCalcTransform.cpp index 204dfe21733..0af9f38b20f 100644 --- a/src/Processors/Transforms/TTLCalcTransform.cpp +++ b/src/Processors/Transforms/TTLCalcTransform.cpp @@ -7,13 +7,15 @@ namespace DB static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) { auto expr = ttl_descr.buildExpression(context); - auto where_expr = 
ttl_descr.buildWhereExpression(context); - auto expr_queries = expr.sets->getSubqueries(); - auto where_expr_queries = expr.sets->getSubqueries(); - subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); - subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + + auto where_expr = ttl_descr.buildWhereExpression(context); + if (where_expr.sets) + { + auto where_expr_queries = where_expr.sets->getSubqueries(); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + } return {expr.expression, where_expr.expression}; } diff --git a/src/Processors/Transforms/TTLTransform.cpp b/src/Processors/Transforms/TTLTransform.cpp index 69e2e6e5fc0..69b7d80c563 100644 --- a/src/Processors/Transforms/TTLTransform.cpp +++ b/src/Processors/Transforms/TTLTransform.cpp @@ -19,13 +19,15 @@ namespace DB static TTLExpressions getExpressions(const TTLDescription & ttl_descr, PreparedSets::Subqueries & subqueries_for_sets, const ContextPtr & context) { auto expr = ttl_descr.buildExpression(context); - auto where_expr = ttl_descr.buildWhereExpression(context); - auto expr_queries = expr.sets->getSubqueries(); - auto where_expr_queries = expr.sets->getSubqueries(); - subqueries_for_sets.insert(subqueries_for_sets.end(), expr_queries.begin(), expr_queries.end()); - subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + + auto where_expr = ttl_descr.buildWhereExpression(context); + if (where_expr.sets) + { + auto where_expr_queries = where_expr.sets->getSubqueries(); + subqueries_for_sets.insert(subqueries_for_sets.end(), where_expr_queries.begin(), where_expr_queries.end()); + } return {expr.expression, where_expr.expression}; } From 0015ec28f9f70548c31e220f2dd826e4ac21f007 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 5 Dec 2023 12:45:25 +0000 Subject: [PATCH 024/884] Fixing test. 
--- src/Storages/TTLDescription.cpp | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index e32ff11860b..bfd3afc30d8 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -18,6 +18,7 @@ #include #include #include +#include namespace DB @@ -172,11 +173,26 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndTypesList & columns, const ContextPtr & context) { ExpressionAndSets result; + auto ttl_string = queryToString(ast); auto syntax_analyzer_result = TreeRewriter(context).analyze(ast, columns); ExpressionAnalyzer analyzer(ast, syntax_analyzer_result, context); - result.expression = analyzer.getActions(false); + auto dag = analyzer.getActionsDAG(false); + + const auto * col = &dag->findInOutputs(ast->getColumnName()); + // std::cerr << "buildExpressionAndSets " << ttl_string << std::endl; + if (col->result_name != ttl_string) + col = &dag->addAlias(*col, ttl_string); + + dag->getOutputs() = {col}; + dag->removeUnusedActions(); + + result.expression = std::make_shared(dag, ExpressionActionsSettings::fromContext(context)); result.sets = analyzer.getPreparedSets(); + // std::cerr << "--------- buildExpressionAndSets\n"; + // std::cerr << result.expression->dumpActions() << std::endl; + // std::cerr << result.sets->getSubqueries().size() << std::endl; + return result; } @@ -218,7 +234,7 @@ TTLDescription TTLDescription::getTTLFromAST( // auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); // result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); - result.result_column = ttl_ast->getColumnName(); + result.result_column = expression->getSampleBlock().safeGetByPosition(0).name; ExpressionActionsPtr where_expression; @@ -244,7 +260,7 @@ TTLDescription TTLDescription::getTTLFromAST( // result.where_expression = ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); result.where_expression_columns = where_expression->getRequiredColumnsWithTypes(); - result.where_result_column = where_expr_ast->getColumnName(); + result.where_result_column = where_expression->getSampleBlock().safeGetByPosition(0).name; } } else if (ttl_element->mode == TTLMode::GROUP_BY) From 43a23898e0ddb71fe810dafd850cef911dace902 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 5 Dec 2023 14:20:07 +0000 Subject: [PATCH 025/884] Updating the tests. 
--- .../0_stateless/01465_ttl_recompression.reference | 6 +++--- .../queries/0_stateless/02932_set_ttl_where.reference | 3 +++ tests/queries/0_stateless/02932_set_ttl_where.sql | 10 +--------- 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/tests/queries/0_stateless/01465_ttl_recompression.reference b/tests/queries/0_stateless/01465_ttl_recompression.reference index 108df565669..90661a5dc78 100644 --- a/tests/queries/0_stateless/01465_ttl_recompression.reference +++ b/tests/queries/0_stateless/01465_ttl_recompression.reference @@ -13,9 +13,9 @@ CREATE TABLE default.recompression_table\n(\n `dt` DateTime,\n `key` UInt6 1_1_1 LZ4 2_2_2 ZSTD(12) 3_3_3 ZSTD(12) -1_1_1 ['plus(dt, toIntervalDay(1))'] -2_2_2 ['plus(dt, toIntervalDay(1))'] -3_3_3 ['plus(dt, toIntervalDay(1))'] +1_1_1 ['dt + toIntervalDay(1)'] +2_2_2 ['dt + toIntervalDay(1)'] +3_3_3 ['dt + toIntervalDay(1)'] 1_1_1 LZ4 2_2_2 LZ4 3_3_3 LZ4 diff --git a/tests/queries/0_stateless/02932_set_ttl_where.reference b/tests/queries/0_stateless/02932_set_ttl_where.reference index e69de29bb2d..bb0b1cf658d 100644 --- a/tests/queries/0_stateless/02932_set_ttl_where.reference +++ b/tests/queries/0_stateless/02932_set_ttl_where.reference @@ -0,0 +1,3 @@ +0 +0 +0 diff --git a/tests/queries/0_stateless/02932_set_ttl_where.sql b/tests/queries/0_stateless/02932_set_ttl_where.sql index 85fddf613e8..bf2b317c4bf 100644 --- a/tests/queries/0_stateless/02932_set_ttl_where.sql +++ b/tests/queries/0_stateless/02932_set_ttl_where.sql @@ -1,18 +1,10 @@ -create or replace table temp ( - a UInt32 -) -engine = MergeTree -order by a; - -insert into temp select number from system.numbers limit 100_000; - create or replace table t_temp ( a UInt32, timestamp DateTime ) engine = MergeTree order by a -TTL timestamp + INTERVAL 2 SECOND WHERE a in (select a from temp); +TTL timestamp + INTERVAL 2 SECOND WHERE a in (select number from system.numbers limit 100_000); select sleep(1); insert into t_temp select rand(), now() from system.numbers limit 1_000_000; From 7dc7062dadd5ddf3bed3dea4364cabfa97bcd61a Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 6 Dec 2023 12:53:14 +0000 Subject: [PATCH 026/884] Fixing test. 
--- src/Interpreters/PreparedSets.cpp | 3 ++- src/Interpreters/Set.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/PreparedSets.cpp b/src/Interpreters/PreparedSets.cpp index ea8d9a62b8b..9f646825d9f 100644 --- a/src/Interpreters/PreparedSets.cpp +++ b/src/Interpreters/PreparedSets.cpp @@ -189,7 +189,8 @@ SetPtr FutureSetFromSubquery::buildOrderedSetInplace(const ContextPtr & context) } } - set_and_key->set->fillSetElements(); + if (!set_and_key->set->hasSetElements()) + set_and_key->set->fillSetElements(); return buildSetInplace(context); } diff --git a/src/Interpreters/Set.h b/src/Interpreters/Set.h index 7136b090c42..7e8e0f2371b 100644 --- a/src/Interpreters/Set.h +++ b/src/Interpreters/Set.h @@ -77,6 +77,7 @@ public: const DataTypes & getElementsTypes() const { return set_elements_types; } bool hasExplicitSetElements() const { return fill_set_elements || (!set_elements.empty() && set_elements.front()->size() == data.getTotalRowCount()); } + bool hasSetElements() const { return !set_elements.empty(); } Columns getSetElements() const { checkIsCreated(); return { set_elements.begin(), set_elements.end() }; } void checkColumnsNumber(size_t num_key_columns) const; From 59153e865d4ffeda3c67cbdd945e14fdc860e446 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Tue, 19 Dec 2023 09:53:04 +0000 Subject: [PATCH 027/884] materialize column not to override past values Signed-off-by: Duc Canh Le --- src/Storages/MergeTree/MutateTask.cpp | 23 +++++++-- .../0_stateless/02008_materialize_column.sql | 1 + ..._column_not_override_past_values.reference | 29 +++++++++++ ...ialize_column_not_override_past_values.sql | 49 +++++++++++++++++++ 4 files changed, 97 insertions(+), 5 deletions(-) create mode 100644 tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference create mode 100644 tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index 827749aa094..a04d9cdb886 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -65,6 +65,7 @@ static void splitAndModifyMutationCommands( Poco::Logger * log) { auto part_columns = part->getColumnsDescription(); + const auto & table_columns = metadata_snapshot->getColumns(); if (!isWidePart(part) || !isFullPartStorage(part->getDataPartStorage())) { @@ -73,9 +74,16 @@ static void splitAndModifyMutationCommands( for (const auto & command : commands) { + if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) + { + /// For ordinary column with default expression, materialize column should not override past values + /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file + auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); + if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + mutated_columns.emplace(command.column_name); + } if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC - || command.type == MutationCommand::Type::MATERIALIZE_COLUMN || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL || command.type == MutationCommand::Type::DELETE @@ -85,8 +93,6 @@ static void splitAndModifyMutationCommands( for (const auto & [column_name, expr] : 
command.column_to_update_expression) mutated_columns.emplace(column_name); - if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) - mutated_columns.emplace(command.column_name); } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION @@ -196,8 +202,15 @@ static void splitAndModifyMutationCommands( { for (const auto & command : commands) { - if (command.type == MutationCommand::Type::MATERIALIZE_INDEX - || command.type == MutationCommand::Type::MATERIALIZE_COLUMN + if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) + { + /// For ordinary column with default expression, materialize column should not override past values + /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file + auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); + if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + for_interpreter.push_back(command); + } + else if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC || command.type == MutationCommand::Type::MATERIALIZE_PROJECTION || command.type == MutationCommand::Type::MATERIALIZE_TTL diff --git a/tests/queries/0_stateless/02008_materialize_column.sql b/tests/queries/0_stateless/02008_materialize_column.sql index a78920d2525..cc7d3096402 100644 --- a/tests/queries/0_stateless/02008_materialize_column.sql +++ b/tests/queries/0_stateless/02008_materialize_column.sql @@ -17,6 +17,7 @@ ALTER TABLE tmp MATERIALIZE COLUMN s; ALTER TABLE tmp MODIFY COLUMN s String DEFAULT toString(x+2); SELECT arraySort(groupArray(x)), groupArray(s) FROM tmp; +ALTER TABLE tmp CLEAR COLUMN s; -- Need to clear because MATERIALIZE COLUMN won't override past values; ALTER TABLE tmp MATERIALIZE COLUMN s; ALTER TABLE tmp MODIFY COLUMN s String DEFAULT toString(x+3); SELECT arraySort(groupArray(x)), groupArray(s) FROM tmp; diff --git a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference new file mode 100644 index 00000000000..6b0d88bd09b --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference @@ -0,0 +1,29 @@ +--Origin-- +1 2 +2 54321 +--After materialize-- +1 2 +2 54321 +--Origin-- +1 2 +2 54321 +--After materialize-- +1 2 +2 54321 +--Origin-- +1 2 +2 \N +3 54321 +--After materialize-- +1 2 +2 \N +3 54321 +--Origin-- +1 2 +2 54321 +--After rename-- +1 2 +2 54321 +--After materialize-- +1 2 +2 54321 diff --git a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql new file mode 100644 index 00000000000..1815661e097 --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql @@ -0,0 +1,49 @@ + +SET mutations_sync = 2; +-- Compact parts +CREATE TABLE test (id Int64, foo Int64 default 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO test ( id, foo ) values ( 1, 2 ); +INSERT INTO test ( id ) values ( 2 ); +SELECT '--Origin--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test MATERIALIZE COLUMN foo; +SELECT '--After materialize--'; +SELECT * FROM test ORDER BY id; +DROP TABLE test; + +-- Wide parts +CREATE TABLE test (id Int64, foo Nullable(Int64) default 54321) ENGINE 
MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO test ( id, foo ) values ( 1, 2 ); +INSERT INTO test ( id ) values ( 2 ); +SELECT '--Origin--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test MATERIALIZE COLUMN foo; +SELECT '--After materialize--'; +SELECT * FROM test ORDER BY id; +DROP TABLE test; + +-- Nullable column != physically absent +CREATE TABLE test (id Int64, foo Nullable(Int64) default 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO test ( id, foo ) values ( 1, 2 ); +INSERT INTO test ( id, foo ) values ( 2, NULL ); +INSERT INTO test ( id ) values ( 3 ); +SELECT '--Origin--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test MATERIALIZE COLUMN foo; +SELECT '--After materialize--'; +SELECT * FROM test ORDER BY id; +DROP TABLE test; + +-- Parts with renamed column +CREATE TABLE test (id Int64, foo Int64 default 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO test ( id, foo ) values ( 1, 2 ); +INSERT INTO test ( id ) values ( 2 ); +SELECT '--Origin--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test RENAME COLUMN foo TO bar; +SELECT '--After rename--'; +SELECT * FROM test ORDER BY id; +ALTER TABLE test MATERIALIZE COLUMN bar; +SELECT '--After materialize--'; +SELECT * FROM test ORDER BY id; +DROP TABLE test; \ No newline at end of file From a924b01a023512727d6a36fc12052f67438ba199 Mon Sep 17 00:00:00 2001 From: Justin de Guzman Date: Tue, 19 Dec 2023 02:05:32 -0800 Subject: [PATCH 028/884] [Docs] Clarify to use query level settings in ClickHouse Cloud --- docs/en/operations/query-cache.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/en/operations/query-cache.md b/docs/en/operations/query-cache.md index def0f48b968..2f05599e666 100644 --- a/docs/en/operations/query-cache.md +++ b/docs/en/operations/query-cache.md @@ -31,6 +31,10 @@ This reduces maintenance effort and avoids redundancy. ## Configuration Settings and Usage +:::note +In ClickHouse Cloud, you must use [query level settings](/en/operations/settings/query-level) to edit query cache settings. Editing [config level settings](/en/operations/configuration-files) is currently not supported. +::: + Setting [use_query_cache](settings/settings.md#use-query-cache) can be used to control whether a specific query or all queries of the current session should utilize the query cache. 
For example, the first execution of query From e832599dfab7ba2304a4a00175ce48f6a63ed701 Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 20 Dec 2023 04:57:56 +0000 Subject: [PATCH 029/884] fix materialize column for compact parts Signed-off-by: Duc Canh Le --- src/Storages/MergeTree/MutateTask.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index a04d9cdb886..dd84aa0d98a 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -80,7 +80,11 @@ static void splitAndModifyMutationCommands( /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) + { + LOG_DEBUG(log, "Materializing column {}\n", command.column_name); + for_interpreter.push_back(command); mutated_columns.emplace(command.column_name); + } } if (command.type == MutationCommand::Type::MATERIALIZE_INDEX || command.type == MutationCommand::Type::MATERIALIZE_STATISTIC @@ -92,7 +96,6 @@ static void splitAndModifyMutationCommands( for_interpreter.push_back(command); for (const auto & [column_name, expr] : command.column_to_update_expression) mutated_columns.emplace(column_name); - } else if (command.type == MutationCommand::Type::DROP_INDEX || command.type == MutationCommand::Type::DROP_PROJECTION From 7b49a0e530e2a2cb8629c249b96f43c6554ea51d Mon Sep 17 00:00:00 2001 From: Duc Canh Le Date: Wed, 20 Dec 2023 04:59:03 +0000 Subject: [PATCH 030/884] remove junk log Signed-off-by: Duc Canh Le --- src/Storages/MergeTree/MutateTask.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index dd84aa0d98a..bb41608eb00 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -81,7 +81,6 @@ static void splitAndModifyMutationCommands( auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) { - LOG_DEBUG(log, "Materializing column {}\n", command.column_name); for_interpreter.push_back(command); mutated_columns.emplace(command.column_name); } From bc757559c9f3fd1943bf338dc4fdac9e0e61240a Mon Sep 17 00:00:00 2001 From: una Date: Sat, 23 Dec 2023 18:10:42 +0800 Subject: [PATCH 031/884] feat:add InitialQuery event --- src/Common/ProfileEvents.cpp | 1 + src/Databases/DatabaseReplicatedWorker.cpp | 7 +++++-- src/Interpreters/DDLWorker.cpp | 2 +- .../queries/0_stateless/02950_initialquery_event.reference | 1 + tests/queries/0_stateless/02950_initialquery_event.sql | 1 + 5 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02950_initialquery_event.reference create mode 100644 tests/queries/0_stateless/02950_initialquery_event.sql diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index f342a19b2aa..a2dc7f5ecd6 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -8,6 +8,7 @@ M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. 
May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \ M(SelectQuery, "Same as Query, but only for SELECT queries.") \ M(InsertQuery, "Same as Query, but only for INSERT queries.") \ + M(InitialQuery, "Same as Query, but only counts initial queries (see is_initial_query).")\ M(QueriesWithSubqueries, "Count queries with all subqueries") \ M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries") \ M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries") \ diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 2056b403ff6..c90af7d4ea8 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -6,7 +6,10 @@ #include namespace fs = std::filesystem; - +namespace ProfileEvents +{ + extern const Event InitialQuery; +} namespace DB { @@ -264,7 +267,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr chassert(!task->entry.query.empty()); assert(!zookeeper->exists(task->getFinishedNodePath())); task->is_initial_query = true; - + ProfileEvents::increment(ProfileEvents::InitialQuery); LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); UInt64 timeout = query_context->getSettingsRef().database_replicated_initial_query_timeout_sec; { diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index f08fd72ff7f..ac3af6e441c 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -490,7 +490,7 @@ bool DDLWorker::tryExecuteQuery(DDLTaskBase & task, const ZooKeeperPtr & zookeep if (!task.is_initial_query) query_scope.emplace(query_context); - + executeQuery(istr, ostr, !task.is_initial_query, query_context, {}, QueryFlags{ .internal = false, .distributed_backup_restore = task.entry.is_backup_restore }); if (auto txn = query_context->getZooKeeperMetadataTransaction()) diff --git a/tests/queries/0_stateless/02950_initialquery_event.reference b/tests/queries/0_stateless/02950_initialquery_event.reference new file mode 100644 index 00000000000..7ad67a1e7e4 --- /dev/null +++ b/tests/queries/0_stateless/02950_initialquery_event.reference @@ -0,0 +1 @@ +InitialQuery 6 Same as Query, but only counts initial queries (see is_initial_query). 
diff --git a/tests/queries/0_stateless/02950_initialquery_event.sql b/tests/queries/0_stateless/02950_initialquery_event.sql new file mode 100644 index 00000000000..2b03607c5c7 --- /dev/null +++ b/tests/queries/0_stateless/02950_initialquery_event.sql @@ -0,0 +1 @@ +SELECT * FROM system.events where event = 'InitialQuery' \ No newline at end of file From b38e7060ef455e6ae569d371203309a1ad992c66 Mon Sep 17 00:00:00 2001 From: una Date: Sat, 23 Dec 2023 18:36:23 +0800 Subject: [PATCH 032/884] feat:add InitialQuery event --- src/Common/ProfileEvents.cpp | 1 + src/Databases/DatabaseReplicatedWorker.cpp | 7 +++++-- src/Interpreters/DDLWorker.cpp | 2 +- .../queries/0_stateless/02950_initialquery_event.reference | 1 + tests/queries/0_stateless/02950_initialquery_event.sql | 1 + 5 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 tests/queries/0_stateless/02950_initialquery_event.reference create mode 100644 tests/queries/0_stateless/02950_initialquery_event.sql diff --git a/src/Common/ProfileEvents.cpp b/src/Common/ProfileEvents.cpp index f342a19b2aa..a2dc7f5ecd6 100644 --- a/src/Common/ProfileEvents.cpp +++ b/src/Common/ProfileEvents.cpp @@ -8,6 +8,7 @@ M(Query, "Number of queries to be interpreted and potentially executed. Does not include queries that failed to parse or were rejected due to AST size limits, quota limits or limits on the number of simultaneously running queries. May include internal queries initiated by ClickHouse itself. Does not count subqueries.") \ M(SelectQuery, "Same as Query, but only for SELECT queries.") \ M(InsertQuery, "Same as Query, but only for INSERT queries.") \ + M(InitialQuery, "Same as Query, but only counts initial queries (see is_initial_query).")\ M(QueriesWithSubqueries, "Count queries with all subqueries") \ M(SelectQueriesWithSubqueries, "Count SELECT queries with all subqueries") \ M(InsertQueriesWithSubqueries, "Count INSERT queries with all subqueries") \ diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 2056b403ff6..c90af7d4ea8 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -6,7 +6,10 @@ #include namespace fs = std::filesystem; - +namespace ProfileEvents +{ + extern const Event InitialQuery; +} namespace DB { @@ -264,7 +267,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr chassert(!task->entry.query.empty()); assert(!zookeeper->exists(task->getFinishedNodePath())); task->is_initial_query = true; - + ProfileEvents::increment(ProfileEvents::InitialQuery); LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); UInt64 timeout = query_context->getSettingsRef().database_replicated_initial_query_timeout_sec; { diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index f08fd72ff7f..ac3af6e441c 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -490,7 +490,7 @@ bool DDLWorker::tryExecuteQuery(DDLTaskBase & task, const ZooKeeperPtr & zookeep if (!task.is_initial_query) query_scope.emplace(query_context); - + executeQuery(istr, ostr, !task.is_initial_query, query_context, {}, QueryFlags{ .internal = false, .distributed_backup_restore = task.entry.is_backup_restore }); if (auto txn = query_context->getZooKeeperMetadataTransaction()) diff --git a/tests/queries/0_stateless/02950_initialquery_event.reference b/tests/queries/0_stateless/02950_initialquery_event.reference new file mode 100644 index 
00000000000..7ad67a1e7e4 --- /dev/null +++ b/tests/queries/0_stateless/02950_initialquery_event.reference @@ -0,0 +1 @@ +InitialQuery 6 Same as Query, but only counts initial queries (see is_initial_query). diff --git a/tests/queries/0_stateless/02950_initialquery_event.sql b/tests/queries/0_stateless/02950_initialquery_event.sql new file mode 100644 index 00000000000..2b03607c5c7 --- /dev/null +++ b/tests/queries/0_stateless/02950_initialquery_event.sql @@ -0,0 +1 @@ +SELECT * FROM system.events where event = 'InitialQuery' \ No newline at end of file From 3e22f29b4529b6fefd5e92616ce9ef1ac33966d0 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sat, 23 Dec 2023 11:40:58 +0100 Subject: [PATCH 033/884] Fixed parameters --- docs/en/operations/backup.md | 2 +- .../registerBackupEngineAzureBlobStorage.cpp | 25 +++++++++++++++---- .../test.py | 2 +- 3 files changed, 22 insertions(+), 7 deletions(-) diff --git a/docs/en/operations/backup.md b/docs/en/operations/backup.md index 15d953249a0..4871f97c270 100644 --- a/docs/en/operations/backup.md +++ b/docs/en/operations/backup.md @@ -463,7 +463,7 @@ To write backups to an AzureBlobStorage container you need the following pieces The destination for a backup will be specified like this: ``` -AzureBlobStorage('/', '', '', '', ') +AzureBlobStorage('/', '', '', '', '') ``` ```sql diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index 6f7b5f38c28..ef95206831f 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -22,6 +22,7 @@ namespace ErrorCodes { extern const int BAD_ARGUMENTS; extern const int SUPPORT_IS_DISABLED; + extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } #if USE_AZURE_BLOB_STORAGE @@ -54,20 +55,34 @@ void registerBackupEngineAzureBlobStorage(BackupFactory & factory) StorageAzureBlob::Configuration configuration; - if (args.size() == 4) + if (args.size() == 3) { configuration.connection_url = args[0].safeGet(); configuration.is_connection_string = true; configuration.container = args[1].safeGet(); configuration.blob_path = args[2].safeGet(); - configuration.format = args[3].safeGet(); LOG_TRACE(&Poco::Logger::get("registerBackupEngineAzureBlobStorage"), "configuration.connection_url = {}" "configuration.container = {}" - "configuration.blob_path = {}" - "configuration.format = {}", - configuration.connection_url, configuration.container, configuration.blob_path, configuration.format); + "configuration.blob_path = {}", + configuration.connection_url, configuration.container, configuration.blob_path); + } + else if (args.size() == 5) + { + configuration.connection_url = args[0].safeGet(); + configuration.is_connection_string = false; + + configuration.container = args[1].safeGet(); + configuration.blob_path = args[2].safeGet(); + configuration.account_name = args[3].safeGet(); + configuration.account_key = args[4].safeGet(); + + } + else + { + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, + "Backup AzureBlobStorage requires 3 or 5 arguments: connection string>/ Date: Sat, 23 Dec 2023 18:42:41 +0800 Subject: [PATCH 034/884] feat:add InitialQuery event --- src/Interpreters/DDLWorker.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/DDLWorker.cpp b/src/Interpreters/DDLWorker.cpp index ac3af6e441c..f08fd72ff7f 100644 --- a/src/Interpreters/DDLWorker.cpp +++ b/src/Interpreters/DDLWorker.cpp @@ -490,7 +490,7 @@ bool 
DDLWorker::tryExecuteQuery(DDLTaskBase & task, const ZooKeeperPtr & zookeep if (!task.is_initial_query) query_scope.emplace(query_context); - + executeQuery(istr, ostr, !task.is_initial_query, query_context, {}, QueryFlags{ .internal = false, .distributed_backup_restore = task.entry.is_backup_restore }); if (auto txn = query_context->getZooKeeperMetadataTransaction()) From fa5dde0bff8f34ebe85e1cc6e929f834c5e6b496 Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 12:37:06 +0800 Subject: [PATCH 035/884] feat: Add initial query event --- src/Databases/DatabaseReplicatedWorker.cpp | 6 +-- src/Interpreters/InterpreterFactory.cpp | 5 +- ..._distributed_initial_query_event.reference | 6 +++ .../02950_distributed_initial_query_event.sh | 54 +++++++++++++++++++ .../02950_initialquery_event.reference | 1 - .../0_stateless/02950_initialquery_event.sql | 1 - 6 files changed, 66 insertions(+), 7 deletions(-) create mode 100644 tests/queries/0_stateless/02950_distributed_initial_query_event.reference create mode 100644 tests/queries/0_stateless/02950_distributed_initial_query_event.sh delete mode 100644 tests/queries/0_stateless/02950_initialquery_event.reference delete mode 100644 tests/queries/0_stateless/02950_initialquery_event.sql diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index c90af7d4ea8..317cda3cd3d 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -7,9 +7,7 @@ namespace fs = std::filesystem; namespace ProfileEvents -{ - extern const Event InitialQuery; -} + namespace DB { @@ -267,7 +265,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr chassert(!task->entry.query.empty()); assert(!zookeeper->exists(task->getFinishedNodePath())); task->is_initial_query = true; - ProfileEvents::increment(ProfileEvents::InitialQuery); + LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); UInt64 timeout = query_context->getSettingsRef().database_replicated_initial_query_timeout_sec; { diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index e32cbe4ccad..fdf7e8ebfbb 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -120,6 +120,7 @@ namespace ProfileEvents { extern const Event Query; + extern const Event InitialQuery; extern const Event QueriesWithSubqueries; extern const Event SelectQuery; extern const Event InsertQuery; @@ -137,7 +138,9 @@ namespace ErrorCodes std::unique_ptr InterpreterFactory::get(ASTPtr & query, ContextMutablePtr context, const SelectQueryOptions & options) { ProfileEvents::increment(ProfileEvents::Query); - + + if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) + ProfileEvents::increment(ProfileEvents::InitialQuery); /// SELECT and INSERT query will handle QueriesWithSubqueries on their own. 
if (!(query->as() || query->as() || diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.reference b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference new file mode 100644 index 00000000000..af8542c7204 --- /dev/null +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference @@ -0,0 +1,6 @@ +Local situation +Initial Query Difference: 1 +Query Difference: 1 +Distributed situation +Initial Query Difference: 1 +Query Difference: 2 diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh new file mode 100644 index 00000000000..3a01aa63d87 --- /dev/null +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh @@ -0,0 +1,54 @@ +-- Tags: distributed + +CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CUR_DIR"/../shell_config.sh +# CREATE TABLE local (x UInt8) Engine=Memory; +# CREATE TABLE distributed ON CLUSTER cluster (p Date, i Int32) ENGINE = Distributed(test_cluster_two_shards, currentDatabase(), x) +$CLICKHOUSE_CLIENT -n -q " +DROP TABLE IF EXISTS local; +DROP TABLE IF EXISTS distributed; +CREATE TABLE local (x UInt8) Engine=Memory; +CREATE TABLE distributed AS local ENGINE = Distributed(test_cluster_two_shards, currentDatabase(), local, x); +INSERT INTO distributed SELECT number FROM numbers(10); +SYSTEM FLUSH DISTRIBUTED distributed; +" +echo "Local situation" +# before SELECT * FROM local +query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Execute SELECT * FROM local +$CLICKHOUSE_CLIENT -q "SELECT * FROM local" > /dev/null + +# Counts after SELECT * FROM local +After_query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +After_query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Calculate the differences +Initial_query_diff=$(($After_query_countI-$query_countI-2)) +query_diff=$(($After_query_countQ-$query_countQ-2)) + +echo "Initial Query Difference: $Initial_query_diff" +echo "Query Difference: $query_diff" +echo "Distributed situation" + +# before SELECT * FROM distributed +query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Execute SELECT * FROM distributed +$CLICKHOUSE_CLIENT -q "SELECT * FROM distributed" > /dev/null + +# Counts after SELECT * FROM distributed +After_query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") +After_query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") + +# Calculate the differences +Initial_query_diff=$(($After_query_countI-$query_countI-2)) +query_diff=$(($After_query_countQ-$query_countQ-2)) + +echo "Initial Query Difference: $Initial_query_diff" +echo "Query Difference: $query_diff" + + diff --git a/tests/queries/0_stateless/02950_initialquery_event.reference b/tests/queries/0_stateless/02950_initialquery_event.reference deleted file mode 100644 index 7ad67a1e7e4..00000000000 --- a/tests/queries/0_stateless/02950_initialquery_event.reference +++ /dev/null @@ -1 +0,0 @@ -InitialQuery 6 Same as Query, but only counts initial queries (see is_initial_query). 
diff --git a/tests/queries/0_stateless/02950_initialquery_event.sql b/tests/queries/0_stateless/02950_initialquery_event.sql deleted file mode 100644 index 2b03607c5c7..00000000000 --- a/tests/queries/0_stateless/02950_initialquery_event.sql +++ /dev/null @@ -1 +0,0 @@ -SELECT * FROM system.events where event = 'InitialQuery' \ No newline at end of file From 1464c3d1aab8c6ecdc369facceb1b9f6cf4b36fb Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 15:13:21 +0800 Subject: [PATCH 036/884] feat: Add initial query event --- src/Databases/DatabaseReplicatedWorker.cpp | 3 +-- .../02950_distributed_initial_query_event.reference | 2 +- .../0_stateless/02950_distributed_initial_query_event.sh | 7 +++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/Databases/DatabaseReplicatedWorker.cpp b/src/Databases/DatabaseReplicatedWorker.cpp index 317cda3cd3d..2056b403ff6 100644 --- a/src/Databases/DatabaseReplicatedWorker.cpp +++ b/src/Databases/DatabaseReplicatedWorker.cpp @@ -6,7 +6,6 @@ #include namespace fs = std::filesystem; -namespace ProfileEvents namespace DB { @@ -265,7 +264,7 @@ String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entr chassert(!task->entry.query.empty()); assert(!zookeeper->exists(task->getFinishedNodePath())); task->is_initial_query = true; - + LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name); UInt64 timeout = query_context->getSettingsRef().database_replicated_initial_query_timeout_sec; { diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.reference b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference index af8542c7204..cf10427e9b3 100644 --- a/tests/queries/0_stateless/02950_distributed_initial_query_event.reference +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.reference @@ -3,4 +3,4 @@ Initial Query Difference: 1 Query Difference: 1 Distributed situation Initial Query Difference: 1 -Query Difference: 2 +Query Difference: 3 diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh index 3a01aa63d87..c8a955c4fe5 100644 --- a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh @@ -1,4 +1,5 @@ --- Tags: distributed +#!/usr/bin/env bash +# Tags:no-parallel shard CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh @@ -38,7 +39,7 @@ query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE even query_countQ=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'Query'") # Execute SELECT * FROM distributed -$CLICKHOUSE_CLIENT -q "SELECT * FROM distributed" > /dev/null +$CLICKHOUSE_CLIENT -q "SELECT * FROM distributed SETTINGS prefer_localhost_replica = 0" > /dev/null # Counts after SELECT * FROM distributed After_query_countI=$($CLICKHOUSE_CLIENT -q "SELECT value FROM system.events WHERE event = 'InitialQuery'") @@ -50,5 +51,3 @@ query_diff=$(($After_query_countQ-$query_countQ-2)) echo "Initial Query Difference: $Initial_query_diff" echo "Query Difference: $query_diff" - - From 22e1bcb9d638d5df0c43585b1d78228beedb0dc8 Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 16:12:10 +0800 Subject: [PATCH 037/884] feat:add InitialQuery event Signed-off-by: una --- .../0_stateless/02950_distributed_initial_query_event.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh index c8a955c4fe5..ddd0fb1e408 100644 --- a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags:no-parallel shard +# Tags:no-parallel, shard CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From a6f2eaf5a6ba2a26943d0c1c53c7cf7460a7471d Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 16:19:06 +0800 Subject: [PATCH 038/884] fix:use , to split tags Signed-off-by: una --- .../0_stateless/02950_distributed_initial_query_event.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh index ddd0fb1e408..7f690a681c4 100644 --- a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh +++ b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh @@ -1,5 +1,5 @@ #!/usr/bin/env bash -# Tags:no-parallel, shard +# Tags:no-parallel,shard CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From d46d91452176414426e40f598a7a1aa989f1a584 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 27 Dec 2023 10:28:52 +0100 Subject: [PATCH 039/884] Updated thread name --- src/Backups/BackupIO_AzureBlobStorage.cpp | 8 +- src/Backups/BackupIO_AzureBlobStorage.h | 81 +++++++++---------- .../copyAzureBlobStorageFile.cpp | 25 +++--- 3 files changed, 59 insertions(+), 55 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index d41d23e3c36..a1fd5bd8327 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -35,7 +35,7 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( const WriteSettings & write_settings_, const ContextPtr & context_) : BackupReaderDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupReaderAzureBlobStorage")) - , data_source_description{DataSourceType::AzureBlobStorage, "AzureBlobStorage", false, false} + , data_source_description{DataSourceType::AzureBlobStorage, configuration_.container, false, false} , configuration(configuration_) { client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); @@ -160,7 +160,7 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( const WriteSettings & write_settings_, const ContextPtr & context_) : BackupWriterDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupWriterAzureBlobStorage")) - , data_source_description{DataSourceType::AzureBlobStorage, "AzureBlobStorage", false, false} + , data_source_description{DataSourceType::AzureBlobStorage,configuration_.container, false, false} , configuration(configuration_) { client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); @@ -209,7 +209,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu settings, read_settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterAzureBlobStorage")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); return; /// copied! 
} } @@ -221,7 +221,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu void BackupWriterAzureBlobStorage::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) { copyDataToAzureBlobStorageFile(create_read_buffer, start_pos, length, client, configuration.container, path_in_backup, settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterAzureBlobStorage")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); } BackupWriterAzureBlobStorage::~BackupWriterAzureBlobStorage() = default; diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 6ef66fc432d..65affb9f079 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -12,57 +12,54 @@ namespace DB { -// using AzureClientPtr = std::shared_ptr; - /// Represents a backup stored to Azure - class BackupReaderAzureBlobStorage : public BackupReaderDefault - { - public: - BackupReaderAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); - ~BackupReaderAzureBlobStorage() override; +class BackupReaderAzureBlobStorage : public BackupReaderDefault +{ +public: + BackupReaderAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + ~BackupReaderAzureBlobStorage() override; - bool fileExists(const String & file_name) override; - UInt64 getFileSize(const String & file_name) override; - std::unique_ptr readFile(const String & file_name) override; + bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; + std::unique_ptr readFile(const String & file_name) override; - void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, - DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; + void copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, + DiskPtr destination_disk, const String & destination_path, WriteMode write_mode) override; - private: - const DataSourceDescription data_source_description; - std::shared_ptr client; - StorageAzureBlob::Configuration configuration; - std::unique_ptr object_storage; - std::shared_ptr settings; - }; +private: + const DataSourceDescription data_source_description; + std::shared_ptr client; + StorageAzureBlob::Configuration configuration; + std::unique_ptr object_storage; + std::shared_ptr settings; +}; +class BackupWriterAzureBlobStorage : public BackupWriterDefault +{ +public: + BackupWriterAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); + ~BackupWriterAzureBlobStorage() override; - class BackupWriterAzureBlobStorage : public BackupWriterDefault - { - public: - BackupWriterAzureBlobStorage(StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, const WriteSettings & write_settings_, const ContextPtr & context_); - ~BackupWriterAzureBlobStorage() override; + bool fileExists(const String & file_name) override; + UInt64 getFileSize(const String & file_name) override; + std::unique_ptr writeFile(const String & file_name) override; - bool 
fileExists(const String & file_name) override; - UInt64 getFileSize(const String & file_name) override; - std::unique_ptr writeFile(const String & file_name) override; + void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; + void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, + bool copy_encrypted, UInt64 start_pos, UInt64 length) override; - void copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) override; - void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, - bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + void removeFile(const String & file_name) override; + void removeFiles(const Strings & file_names) override; - void removeFile(const String & file_name) override; - void removeFiles(const Strings & file_names) override; - - private: - std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; - void removeFilesBatch(const Strings & file_names); - const DataSourceDescription data_source_description; - std::shared_ptr client; - StorageAzureBlob::Configuration configuration; - std::unique_ptr object_storage; - std::shared_ptr settings; - }; +private: + std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; + void removeFilesBatch(const Strings & file_names); + const DataSourceDescription data_source_description; + std::shared_ptr client; + StorageAzureBlob::Configuration configuration; + std::unique_ptr object_storage; + std::shared_ptr settings; +}; } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index bf0bcac664b..0a0a080b5cb 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -22,6 +22,11 @@ namespace ProfileEvents extern const Event DiskAzureUploadPart; } +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + namespace DB { @@ -44,7 +49,8 @@ namespace std::shared_ptr settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunner schedule_, - bool for_disk_azure_blob_storage_) + bool for_disk_azure_blob_storage_, + const Poco::Logger * log_) : create_read_buffer(create_read_buffer_) , client(client_) , offset (offset_) @@ -55,7 +61,7 @@ namespace , object_metadata(object_metadata_) , schedule(schedule_) , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) - , log(&Poco::Logger::get("azureBlobStorageUploadHelper")) + , log(log_) , max_single_part_upload_size(settings_.get()->max_single_part_upload_size) { } @@ -179,11 +185,11 @@ namespace try { auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); - auto buffer = std::make_unique(std::move(read_buffer), part_size); task->data = new char[part_size]; task->size = part_size; - buffer->read(task->data,part_size); - task->block_id = getRandomASCIIString(64); + size_t n = read_buffer->read(task->data,part_size); + if (n != part_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected size"); schedule([this, task, task_finish_notify]() { @@ -208,9 +214,10 @@ namespace { UploadPartTask task; auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); - auto buffer = std::make_unique(std::move(read_buffer), part_size); task.data = new char[part_size]; - 
buffer->read(task.data,part_size); + size_t n = read_buffer->read(task.data,part_size); + if (n != part_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected size"); task.size = part_size; processUploadTask(task); block_ids.emplace_back(task.block_id); @@ -274,7 +281,7 @@ void copyDataToAzureBlobStorageFile( ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) { - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyDataToAzureBlobStorageFile")}; helper.performCopy(); } @@ -314,7 +321,7 @@ void copyAzureBlobStorageFile( settings->max_single_download_retries); }; - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; helper.performCopy(); } } From 0181bab23c38c2d1c15f199d522a4743b11586d6 Mon Sep 17 00:00:00 2001 From: una Date: Wed, 27 Dec 2023 19:59:23 +0800 Subject: [PATCH 040/884] fix:style Signed-off-by: una --- src/Interpreters/InterpreterFactory.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Interpreters/InterpreterFactory.cpp b/src/Interpreters/InterpreterFactory.cpp index fdf7e8ebfbb..c5d7f0f891c 100644 --- a/src/Interpreters/InterpreterFactory.cpp +++ b/src/Interpreters/InterpreterFactory.cpp @@ -138,7 +138,6 @@ namespace ErrorCodes std::unique_ptr InterpreterFactory::get(ASTPtr & query, ContextMutablePtr context, const SelectQueryOptions & options) { ProfileEvents::increment(ProfileEvents::Query); - if (context->getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY) ProfileEvents::increment(ProfileEvents::InitialQuery); /// SELECT and INSERT query will handle QueriesWithSubqueries on their own. 
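A note on what the InitialQuery patches above establish: `ProfileEvents::InitialQuery` is incremented in `InterpreterFactory::get` only when the client info reports `QueryKind::INITIAL_QUERY`, so secondary queries fanned out by a Distributed table bump `Query` but not `InitialQuery`. Below is a minimal sketch of how the effect can be observed from SQL; the `remote()` addresses and the exact counter deltas are illustrative assumptions, not part of the patches.

```sql
-- Illustrative sketch only: assumes a server built with the InitialQuery event
-- and uses loopback "shards" via the remote() table function.
SELECT event, value FROM system.events WHERE event IN ('Query', 'InitialQuery');

-- One initial query that fans out to two replicas; with prefer_localhost_replica = 0
-- the local replica is also queried over TCP, so Query grows by the secondary queries
-- as well, while InitialQuery grows only for the query issued by the client.
SELECT count() FROM remote('127.0.0.{1,2}', system.one) SETTINGS prefer_localhost_replica = 0;

SELECT event, value FROM system.events WHERE event IN ('Query', 'InitialQuery');
```

The probe SELECTs on system.events are themselves initial queries, which is why the stateless test above subtracts a fixed offset from the measured deltas.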
From 2c1513540768eaed34a13fd643c4ace491421c0e Mon Sep 17 00:00:00 2001
From: una
Date: Wed, 27 Dec 2023 20:53:30 +0800
Subject: [PATCH 041/884] fix test-file permissions

Signed-off-by: una

---
 .../queries/0_stateless/02950_distributed_initial_query_event.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 mode change 100644 => 100755 tests/queries/0_stateless/02950_distributed_initial_query_event.sh

diff --git a/tests/queries/0_stateless/02950_distributed_initial_query_event.sh b/tests/queries/0_stateless/02950_distributed_initial_query_event.sh
old mode 100644
new mode 100755
From 32ff152f2d7e4798a7bbc916808cc9ca883cf13e Mon Sep 17 00:00:00 2001
From: flynn
Date: Thu, 28 Dec 2023 11:41:06 +0000
Subject: [PATCH 042/884] Support negative positional arguments

---
 .../replaceForPositionalArguments.cpp         | 24 ++++-
 .../0_stateless/01798_having_push_down.sql    |  3 +-
 .../02006_test_positional_arguments.reference | 94 +++++++++++++++++++
 .../02006_test_positional_arguments.sql       | 21 +++++
 .../02932_group_by_null_fuzzer.sql            |  1 +
 5 files changed, 137 insertions(+), 6 deletions(-)

diff --git a/src/Interpreters/replaceForPositionalArguments.cpp b/src/Interpreters/replaceForPositionalArguments.cpp
index 241dd7cf92c..bea87ad913a 100644
--- a/src/Interpreters/replaceForPositionalArguments.cpp
+++ b/src/Interpreters/replaceForPositionalArguments.cpp
@@ -27,14 +27,28 @@ bool replaceForPositionalArguments(ASTPtr & argument, const ASTSelectQuery * sel
         return false;
 
     auto which = ast_literal->value.getType();
-    if (which != Field::Types::UInt64)
+    if (which != Field::Types::UInt64 && which != Field::Types::Int64)
         return false;
 
-    auto pos = ast_literal->value.get();
+    UInt64 pos;
+
+    if (which == Field::Types::UInt64)
+    {
+        pos = ast_literal->value.get();
+    }
+    else if (which == Field::Types::Int64)
+    {
+        auto value = ast_literal->value.get();
+        pos = value > 0 ?
value : columns.size() + value + 1; + } + else + { + return false; + } + if (!pos || pos > columns.size()) - throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, - "Positional argument out of bounds: {} (expected in range [1, {}]", - pos, columns.size()); + throw Exception( + ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Positional argument out of bounds: {} (expected in range [1, {}]", pos, columns.size()); const auto & column = columns[--pos]; if (typeid_cast(column.get()) || typeid_cast(column.get())) diff --git a/tests/queries/0_stateless/01798_having_push_down.sql b/tests/queries/0_stateless/01798_having_push_down.sql index b3a77c8f5b5..c0c3447f5ab 100644 --- a/tests/queries/0_stateless/01798_having_push_down.sql +++ b/tests/queries/0_stateless/01798_having_push_down.sql @@ -8,11 +8,12 @@ SELECT sum(c0 = 0), min(c0 + 1), sum(c0 + 2) FROM t_having GROUP BY c0 HAVING c0 = 0 SETTINGS enable_optimize_predicate_expression=0; +SET enable_positional_arguments=0; + SELECT c0 + -1, sum(intDivOrZero(intDivOrZero(NULL, NULL), '2'), intDivOrZero(10000000000., intDivOrZero(intDivOrZero(intDivOrZero(NULL, NULL), 10), NULL))) FROM t_having GROUP BY c0 = 2, c0 = 10, intDivOrZero(intDivOrZero(intDivOrZero(NULL, NULL), NULL), NULL), c0 HAVING c0 = 2 SETTINGS enable_optimize_predicate_expression = 0; SELECT sum(c0 + 257) FROM t_having GROUP BY c0 = -9223372036854775808, NULL, -2147483649, c0 HAVING c0 = -9223372036854775808 SETTINGS enable_optimize_predicate_expression = 0; -SET enable_positional_arguments=0; SELECT c0 + -2, c0 + -9223372036854775807, c0 = NULL FROM t_having GROUP BY c0 = 0.9998999834060669, 1023, c0 HAVING c0 = 0.9998999834060669 SETTINGS enable_optimize_predicate_expression = 0; DROP TABLE t_having; diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.reference b/tests/queries/0_stateless/02006_test_positional_arguments.reference index 40100e8d5be..079bd071103 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.reference +++ b/tests/queries/0_stateless/02006_test_positional_arguments.reference @@ -3,18 +3,50 @@ select x3, x2, x1 from test order by 1; 1 100 100 10 1 10 100 10 1 +select x3, x2, x1 from test order by -3; +1 100 100 +10 1 10 +100 10 1 select x3, x2, x1 from test order by x3; 1 100 100 10 1 10 100 10 1 +select x3, x2, x1 from test order by 3; +100 10 1 +10 1 10 +1 100 100 +select x3, x2, x1 from test order by -1; +100 10 1 +10 1 10 +1 100 100 +select x3, x2, x1 from test order by x1; +100 10 1 +10 1 10 +1 100 100 select x3, x2, x1 from test order by 1 desc; 100 10 1 10 1 10 1 100 100 +select x3, x2, x1 from test order by -3 desc; +100 10 1 +10 1 10 +1 100 100 select x3, x2, x1 from test order by x3 desc; 100 10 1 10 1 10 1 100 100 +select x3, x2, x1 from test order by 3 desc; +1 100 100 +10 1 10 +100 10 1 +select x3, x2, x1 from test order by -1 desc; +1 100 100 +10 1 10 +100 10 1 +select x3, x2, x1 from test order by x1 desc; +1 100 100 +10 1 10 +100 10 1 insert into test values (1, 10, 100), (10, 1, 10), (100, 100, 1); select x3, x2 from test group by x3, x2 order by x3; 1 100 @@ -54,6 +86,20 @@ SELECT x1 FROM test ORDER BY x3 + 1 ASC +explain syntax select x3, x2, x1 from test order by -1; +SELECT + x3, + x2, + x1 +FROM test +ORDER BY x1 ASC +explain syntax select x3 + 1, x2, x1 from test order by -1; +SELECT + x3 + 1, + x2, + x1 +FROM test +ORDER BY x1 ASC explain syntax select x3, x3 - x2, x2, x1 from test order by 2; SELECT x3, @@ -62,6 +108,14 @@ SELECT x1 FROM test ORDER BY x3 - x2 ASC +explain syntax select x3, x3 - x2, x2, x1 from 
test order by -2; +SELECT + x3, + x3 - x2, + x2, + x1 +FROM test +ORDER BY x2 ASC explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by 2; SELECT x3, @@ -69,12 +123,28 @@ SELECT x1 + x2 FROM test ORDER BY if(x3 > 10, x3, x1 + x2) ASC +explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by -2; +SELECT + x3, + if(x3 > 10, x3, x1 + x2), + x1 + x2 +FROM test +ORDER BY if(x3 > 10, x3, x1 + x2) ASC explain syntax select max(x1), x2 from test group by 2 order by 1, 2; SELECT max(x1), x2 FROM test GROUP BY x2 +ORDER BY + max(x1) ASC, + x2 ASC +explain syntax select max(x1), x2 from test group by -1 order by -2, -1; +SELECT + max(x1), + x2 +FROM test +GROUP BY x2 ORDER BY max(x1) ASC, x2 ASC @@ -83,16 +153,34 @@ SELECT 1 + greatest(x1, 1), x2 FROM test +GROUP BY + 1 + greatest(x1, 1), + x2 +explain syntax select 1 + greatest(x1, 1), x2 from test group by -2, -1; +SELECT + 1 + greatest(x1, 1), + x2 +FROM test GROUP BY 1 + greatest(x1, 1), x2 select max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } select 1 + max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } +select max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } +select 1 + max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } explain syntax select x1 + x3, x3 from test group by 1, 2; SELECT x1 + x3, x3 FROM test +GROUP BY + x1 + x3, + x3 +explain syntax select x1 + x3, x3 from test group by -2, -1; +SELECT + x1 + x3, + x3 +FROM test GROUP BY x1 + x3, x3 @@ -102,8 +190,14 @@ select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, 1 2 10 100 10 20 1 10 100 200 100 1 +select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, -1 desc, -2 asc; +1 2 10 100 +10 20 1 10 +100 200 100 1 select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,4,5,6 order by a; 44 88 13 14 15 16 +select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,-3,-2,-1 order by a; +44 88 13 14 15 16 explain syntax select plus(1, 1) as a group by a; SELECT 1 + 1 AS a GROUP BY a diff --git a/tests/queries/0_stateless/02006_test_positional_arguments.sql b/tests/queries/0_stateless/02006_test_positional_arguments.sql index 159ad6bd427..6f427e0298d 100644 --- a/tests/queries/0_stateless/02006_test_positional_arguments.sql +++ b/tests/queries/0_stateless/02006_test_positional_arguments.sql @@ -9,11 +9,21 @@ insert into test values (1, 10, 100), (10, 1, 10), (100, 100, 1); -- { echo } select x3, x2, x1 from test order by 1; +select x3, x2, x1 from test order by -3; select x3, x2, x1 from test order by x3; +select x3, x2, x1 from test order by 3; +select x3, x2, x1 from test order by -1; +select x3, x2, x1 from test order by x1; + select x3, x2, x1 from test order by 1 desc; +select x3, x2, x1 from test order by -3 desc; select x3, x2, x1 from test order by x3 desc; +select x3, x2, x1 from test order by 3 desc; +select x3, x2, x1 from test order by -1 desc; +select x3, x2, x1 from test order by x1 desc; + insert into test values (1, 10, 100), (10, 1, 10), (100, 100, 1); select x3, x2 from test group by x3, x2 order by x3; select x3, x2 from test group by 1, 2 order by x3; @@ -25,21 +35,32 @@ select x1, x2, x3 from test order by 3 limit 1 by 1; explain syntax select x3, x2, x1 from test order by 1; explain syntax select x3 + 1, x2, x1 from test order by 1; +explain syntax select x3, x2, x1 from test order by -1; +explain syntax select x3 + 1, x2, x1 from test order by 
-1; explain syntax select x3, x3 - x2, x2, x1 from test order by 2; +explain syntax select x3, x3 - x2, x2, x1 from test order by -2; explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by 2; +explain syntax select x3, if(x3 > 10, x3, plus(x1, x2)), x1 + x2 from test order by -2; explain syntax select max(x1), x2 from test group by 2 order by 1, 2; +explain syntax select max(x1), x2 from test group by -1 order by -2, -1; explain syntax select 1 + greatest(x1, 1), x2 from test group by 1, 2; +explain syntax select 1 + greatest(x1, 1), x2 from test group by -2, -1; select max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } select 1 + max(x1), x2 from test group by 1, 2; -- { serverError 43, 184 } +select max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } +select 1 + max(x1), x2 from test group by -2, -1; -- { serverError 43, 184 } explain syntax select x1 + x3, x3 from test group by 1, 2; +explain syntax select x1 + x3, x3 from test group by -2, -1; create table test2(x1 Int, x2 Int, x3 Int) engine=Memory; insert into test2 values (1, 10, 100), (10, 1, 10), (100, 100, 1); select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, 4 desc, 3 asc; +select x1, x1 * 2, max(x2), max(x3) from test2 group by 2, 1, x1 order by 1, 2, -1 desc, -2 asc; select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,4,5,6 order by a; +select a, b, c, d, e, f from (select 44 a, 88 b, 13 c, 14 d, 15 e, 16 f) t group by 1,2,3,-3,-2,-1 order by a; explain syntax select plus(1, 1) as a group by a; select substr('aaaaaaaaaaaaaa', 8) as a group by a order by a; diff --git a/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql b/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql index 0c28c120d40..603c7783ef8 100644 --- a/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql +++ b/tests/queries/0_stateless/02932_group_by_null_fuzzer.sql @@ -1,5 +1,6 @@ -- https://github.com/ClickHouse/ClickHouse/issues/43202 -- Queries are generated by the fuzzer, so don't expect them to make sense +SET enable_positional_arguments=0; SELECT NULL, '' FROM (SELECT toNullable(''), NULL AS key GROUP BY GROUPING SETS ((NULL))) AS s1 ALL LEFT JOIN (SELECT '' AS key, NULL AS value GROUP BY GROUPING SETS (('')) WITH TOTALS UNION ALL SELECT NULL AS key, toNullable(NULL) AS value GROUP BY '', NULL, '' WITH TOTALS) AS s2 USING (key); SELECT NULL GROUP BY NULL WITH TOTALS; SELECT 1048575, NULL, b FROM (SELECT '25.5' AS a, NULL, NULL AS b GROUP BY GROUPING SETS ((0.0001)) WITH TOTALS) AS js1 ANY RIGHT JOIN (SELECT NULL AS a, NULL AS b WHERE NULL GROUP BY NULL, -9223372036854775807 WITH CUBE WITH TOTALS UNION ALL SELECT NULL AS a, NULL AS b GROUP BY 1, '21474836.46' WITH TOTALS) AS js2 USING (a, b) ORDER BY nan DESC NULLS LAST, '9223372036854775807' DESC NULLS LAST, a ASC NULLS LAST; From e2f4219c12c216ab32a267b153969b758126a077 Mon Sep 17 00:00:00 2001 From: flynn Date: Thu, 28 Dec 2023 12:22:30 +0000 Subject: [PATCH 043/884] Fix --- src/Interpreters/TreeOptimizer.cpp | 7 +++---- .../02943_positional_arguments_bugs.reference | 11 ++++++++++- .../0_stateless/02943_positional_arguments_bugs.sql | 13 +++++++------ 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/src/Interpreters/TreeOptimizer.cpp b/src/Interpreters/TreeOptimizer.cpp index 729e2ed6007..57dba3eef89 100644 --- a/src/Interpreters/TreeOptimizer.cpp +++ b/src/Interpreters/TreeOptimizer.cpp @@ -76,11 +76,10 @@ const std::unordered_set 
possibly_injective_function_names */ void appendUnusedGroupByColumn(ASTSelectQuery * select_query) { - /// You must insert a constant that is not the name of the column in the table. Such a case is rare, but it happens. - /// Also start unused_column integer must not intersect with ([1, source_columns.size()]) - /// might be in positional GROUP BY. + /// Since ASTLiteral is different from ASTIdentifier, so we can use a special constant String Literal for this, + /// and do not need to worry about it conflict with the name of the column in the table. select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, std::make_shared()); - select_query->groupBy()->children.emplace_back(std::make_shared(static_cast(-1))); + select_query->groupBy()->children.emplace_back(std::make_shared("__unused_group_by_column")); } /// Eliminates injective function calls and constant expressions from group by statement. diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference index 702e1261186..47e8df9e382 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference @@ -1,2 +1,11 @@ -45 1 +0 0 +4 4 +3 3 +2 2 +5 5 +1 1 +6 6 +7 7 +9 9 +8 8 processed 99 0 diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql index b8cf73da42d..8cc3fb4b17d 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql @@ -2,18 +2,19 @@ DROP TABLE IF EXISTS t; CREATE TABLE t ( - `n` int + `n` int, + `__unused_group_by_column` int ) - ENGINE = MergeTree - ORDER BY n AS -SELECT * +ENGINE = MergeTree +ORDER BY n AS +SELECT number, number FROM numbers(10); SELECT sum(n), - 1 AS x + __unused_group_by_column FROM t -GROUP BY x; +GROUP BY __unused_group_by_column; SELECT 'processed' AS type, From 2e9cdd17ef136f064042b541dbc68ef64ba8194f Mon Sep 17 00:00:00 2001 From: flynn Date: Thu, 28 Dec 2023 14:08:14 +0000 Subject: [PATCH 044/884] Fix flaky test --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 26 ++++++++++++++----- .../02943_positional_arguments_bugs.reference | 11 ++++---- .../02943_positional_arguments_bugs.sql | 6 +++-- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 3290d918a8b..9ec6d9e358c 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -2156,19 +2156,31 @@ void QueryAnalyzer::replaceNodesWithPositionalArguments(QueryTreeNodePtr & node_ node_to_replace = &sort_node->getExpression(); auto * constant_node = (*node_to_replace)->as(); - if (!constant_node || constant_node->getValue().getType() != Field::Types::UInt64) + + if (!constant_node + || (constant_node->getValue().getType() != Field::Types::UInt64 && constant_node->getValue().getType() != Field::Types::Int64)) continue; - UInt64 positional_argument_number = constant_node->getValue().get(); - if (positional_argument_number == 0 || positional_argument_number > projection_nodes.size()) - throw Exception(ErrorCodes::BAD_ARGUMENTS, + UInt64 pos; + if (constant_node->getValue().getType() == Field::Types::UInt64) + { + pos = constant_node->getValue().get(); + } + else // Int64 + { + auto value = constant_node->getValue().get(); + pos = value > 0 ? 
value : projection_nodes.size() + value + 1; + } + + if (!pos || pos > projection_nodes.size()) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, "Positional argument number {} is out of bounds. Expected in range [1, {}]. In scope {}", - positional_argument_number, + pos, projection_nodes.size(), scope.scope_node->formatASTForErrorMessage()); - --positional_argument_number; - *node_to_replace = projection_nodes[positional_argument_number]; + *node_to_replace = projection_nodes[--pos]; } } diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference index 47e8df9e382..08310b7cf27 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.reference +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.reference @@ -1,11 +1,12 @@ 0 0 -4 4 -3 3 -2 2 -5 5 1 1 +2 2 +3 3 +4 4 +5 5 6 6 7 7 -9 9 8 8 +9 9 +45 1 processed 99 0 diff --git a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql index 8cc3fb4b17d..9b1b872ae40 100644 --- a/tests/queries/0_stateless/02943_positional_arguments_bugs.sql +++ b/tests/queries/0_stateless/02943_positional_arguments_bugs.sql @@ -3,7 +3,7 @@ DROP TABLE IF EXISTS t; CREATE TABLE t ( `n` int, - `__unused_group_by_column` int + `__unused_group_by_column` int ) ENGINE = MergeTree ORDER BY n AS @@ -14,7 +14,9 @@ SELECT sum(n), __unused_group_by_column FROM t -GROUP BY __unused_group_by_column; +GROUP BY __unused_group_by_column ORDER BY __unused_group_by_column; + +SELECT sum(n), 1 as x from t group by x; SELECT 'processed' AS type, From b70ff6d8ea71d4633cdcdbe3ef486707e70c1abb Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 1 Jan 2024 11:02:57 +0100 Subject: [PATCH 045/884] Fix build --- src/Backups/BackupIO_AzureBlobStorage.cpp | 33 +++++++++++++++++++++-- src/Backups/BackupIO_AzureBlobStorage.h | 2 ++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index a1fd5bd8327..bd4efcf63ae 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -218,10 +218,39 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu BackupWriterDefault::copyFileFromDisk(path_in_backup, src_disk, src_path, copy_encrypted, start_pos, length); } +void BackupWriterAzureBlobStorage::copyFile(const String & destination, const String & source, size_t size) +{ + std::shared_ptr src_client; + std::shared_ptr dest_client; + StorageAzureBlob::Configuration src_configuration = configuration; + src_configuration.container = source; + src_client = StorageAzureBlob::createClient(src_configuration, /* is_read_only */ false); + + StorageAzureBlob::Configuration dest_configuration = configuration; + dest_configuration.container = destination; + dest_client = StorageAzureBlob::createClient(dest_configuration, /* is_read_only */ false); + + LOG_TRACE(log, "Copying file inside backup from {} to {} ", source, destination); + copyAzureBlobStorageFile( + src_client, + dest_client, + configuration.container, + fs::path(configuration.blob_path), + 0, + size, + /* dest_bucket= */ destination, + /* dest_key= */ configuration.blob_path, + settings, + read_settings, + {}, + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupRDAzure"), + /* for_disk_azure_blob_storage= */ true); +} + void BackupWriterAzureBlobStorage::copyDataToFile(const String & 
path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) { copyDataToAzureBlobStorageFile(create_read_buffer, start_pos, length, client, configuration.container, path_in_backup, settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure")); } BackupWriterAzureBlobStorage::~BackupWriterAzureBlobStorage() = default; @@ -257,7 +286,7 @@ UInt64 BackupWriterAzureBlobStorage::getFileSize(const String & file_name) RelativePathsWithMetadata children; object_storage->listObjects(key,children,/*max_keys*/0); if (children.empty()) - throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Object {} must exist"); + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Object must exist"); return children[0].metadata.size_bytes; } diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 65affb9f079..87a6c3ef675 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -48,6 +48,8 @@ public: void copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, bool copy_encrypted, UInt64 start_pos, UInt64 length) override; + void copyFile(const String & destination, const String & source, size_t size) override; + void removeFile(const String & file_name) override; void removeFiles(const Strings & file_names) override; From 4122de97213d835de5202d4ca741b4972973884b Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 2 Jan 2024 20:19:01 +0100 Subject: [PATCH 046/884] Updated tests and added settings --- src/Backups/BackupIO_AzureBlobStorage.cpp | 6 +- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 5 +- .../AzureBlobStorage/AzureObjectStorage.h | 11 ++- .../copyAzureBlobStorageFile.cpp | 68 +++++++++++++++++-- src/Storages/StorageAzureBlob.cpp | 2 +- .../configs/config.xml | 11 --- .../configs/disable_profilers.xml | 13 ---- .../configs/users.xml | 8 --- .../test.py | 2 - 9 files changed, 80 insertions(+), 46 deletions(-) delete mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml delete mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml delete mode 100644 tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index bd4efcf63ae..15e8e92a85d 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -139,7 +139,7 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, settings, read_settings, object_attributes, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupReaderAzureBlobStorage"), + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupRDAzure"), /* for_disk_azure_blob_storage= */ true); return file_size; @@ -209,7 +209,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu settings, read_settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWriterS3")); + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure")); return; /// copied! 
} } @@ -243,7 +243,7 @@ void BackupWriterAzureBlobStorage::copyFile(const String & destination, const St settings, read_settings, {}, - threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupRDAzure"), + threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure"), /* for_disk_azure_blob_storage= */ true); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 6075b385a6c..9e703d6fc5e 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -164,7 +164,10 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getUInt64(config_prefix + ".min_bytes_for_seek", 1024 * 1024), config.getInt(config_prefix + ".max_single_read_retries", 3), config.getInt(config_prefix + ".max_single_download_retries", 3), - config.getInt(config_prefix + ".list_object_keys_size", 1000) + config.getInt(config_prefix + ".list_object_keys_size", 1000), + config.getUInt64(config_prefix + ".min_upload_part_size", 16 * 1024 * 1024), + config.getUInt64(config_prefix + ".max_upload_part_size", 5ULL * 1024 * 1024 * 1024), + config.getUInt64(config_prefix + ".max_part_number", 10000) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 8e3d50418d3..55c81b4b7d9 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -24,12 +24,18 @@ struct AzureObjectStorageSettings uint64_t min_bytes_for_seek_, int max_single_read_retries_, int max_single_download_retries_, - int list_object_keys_size_) + int list_object_keys_size_, + size_t min_upload_part_size_, + size_t max_upload_part_size_, + size_t max_part_number_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) , max_single_download_retries(max_single_download_retries_) , list_object_keys_size(list_object_keys_size_) + , min_upload_part_size(min_upload_part_size_) + , max_upload_part_size(max_upload_part_size_) + , max_part_number(max_part_number_) { } @@ -40,6 +46,9 @@ struct AzureObjectStorageSettings size_t max_single_read_retries = 3; size_t max_single_download_retries = 3; int list_object_keys_size = 1000; + size_t min_upload_part_size = 16 * 1024 * 1024; + size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; + size_t max_part_number = 10000; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 0a0a080b5cb..5ca30fa8071 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -22,15 +22,17 @@ namespace ProfileEvents extern const Event DiskAzureUploadPart; } -namespace ErrorCodes -{ - extern const int LOGICAL_ERROR; -} - namespace DB { +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; + extern const int INVALID_CONFIG_PARAMETER; +} + + size_t max_single_operation_copy_size = 256 * 1024 * 1024; @@ -106,6 +108,60 @@ namespace std::mutex bg_tasks_mutex; std::condition_variable bg_tasks_condvar; + void calculatePartSize() + { + if (!total_size) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. 
This must not happen"); + + auto max_part_number = settings.get()->max_part_number; + auto min_upload_part_size = settings.get()->min_upload_part_size; + auto max_upload_part_size = settings.get()->max_upload_part_size; + + if (!max_part_number) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0"); + else if (!min_upload_part_size) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "min_upload_part_size must not be 0"); + else if (max_upload_part_size < min_upload_part_size) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_upload_part_size must not be less than min_upload_part_size"); + + size_t part_size = min_upload_part_size; + size_t num_parts = (total_size + part_size - 1) / part_size; + + if (num_parts > max_part_number) + { + part_size = (total_size + max_part_number - 1) / max_part_number; + num_parts = (total_size + part_size - 1) / part_size; + } + + if (part_size > max_upload_part_size) + { + part_size = max_upload_part_size; + num_parts = (total_size + part_size - 1) / part_size; + } + + if (num_parts < 1 || num_parts > max_part_number || part_size < min_upload_part_size || part_size > max_upload_part_size) + { + String msg; + if (num_parts < 1) + msg = "Number of parts is zero"; + else if (num_parts > max_part_number) + msg = fmt::format("Number of parts exceeds {}", num_parts, max_part_number); + else if (part_size < min_upload_part_size) + msg = fmt::format("Size of a part is less than {}", part_size, min_upload_part_size); + else + msg = fmt::format("Size of a part exceeds {}", part_size, max_upload_part_size); + + throw Exception( + ErrorCodes::INVALID_CONFIG_PARAMETER, + "{} while writing {} bytes to AzureBlobStorage. Check max_part_number = {}, " + "min_upload_part_size = {}, max_upload_part_size = {}", + msg, total_size, max_part_number, min_upload_part_size, max_upload_part_size); + } + + /// We've calculated the size of a normal part (the final part can be smaller). + normal_part_size = part_size; + } + public: void performCopy() { @@ -120,7 +176,7 @@ namespace void performMultipartUpload() { - normal_part_size = 1024; + calculatePartSize(); size_t position = offset; size_t end_position = offset + total_size; diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 1b28a2c2fac..f1070c8c31e 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -1139,7 +1139,7 @@ StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader() QueryPipelineBuilder builder; std::shared_ptr source; std::unique_ptr read_buf; - std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files + std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ? 
tryGetNumRowsFromCache(path_with_metadata) : std::nullopt; if (num_rows_from_cache) { diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml deleted file mode 100644 index 5725dce40cd..00000000000 --- a/tests/integration/test_backup_restore_azure_blob_storage/configs/config.xml +++ /dev/null @@ -1,11 +0,0 @@ - - 1 - 0 - 0.0 - 0 - 1 - 1 - 0 - 16 - 16 - \ No newline at end of file diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml deleted file mode 100644 index b74bb1502ce..00000000000 --- a/tests/integration/test_backup_restore_azure_blob_storage/configs/disable_profilers.xml +++ /dev/null @@ -1,13 +0,0 @@ - - - - - 0 - 0 - 0 - 1000 - 1 - 1 - - - diff --git a/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml b/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml deleted file mode 100644 index c12eb2f79f4..00000000000 --- a/tests/integration/test_backup_restore_azure_blob_storage/configs/users.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - default - - - diff --git a/tests/integration/test_backup_restore_azure_blob_storage/test.py b/tests/integration/test_backup_restore_azure_blob_storage/test.py index 0a48d3523f0..06c18d7468f 100644 --- a/tests/integration/test_backup_restore_azure_blob_storage/test.py +++ b/tests/integration/test_backup_restore_azure_blob_storage/test.py @@ -24,8 +24,6 @@ def cluster(): cluster = ClickHouseCluster(__file__) cluster.add_instance( "node", - main_configs=["configs/config.xml"], - user_configs=["configs/disable_profilers.xml", "configs/users.xml"], with_azurite=True, ) cluster.start() From df221f7db65fd17af6a71704f756e47ceec7a928 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 3 Jan 2024 11:35:06 +0100 Subject: [PATCH 047/884] Renamed Bucket-Key to Container-Blob --- src/Backups/BackupIO_AzureBlobStorage.cpp | 14 +++--- .../copyAzureBlobStorageFile.cpp | 44 +++++++++---------- .../copyAzureBlobStorageFile.h | 10 ++--- 3 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 15e8e92a85d..de40fc6b33b 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -134,8 +134,8 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, fs::path(configuration.blob_path) / path_in_backup, 0, file_size, - /* dest_bucket= */ blob_path[1], - /* dest_key= */ blob_path[0], + /* dest_container */ blob_path[1], + /* dest_path */ blob_path[0], settings, read_settings, object_attributes, @@ -178,7 +178,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu auto source_data_source_description = src_disk->getDataSourceDescription(); if (source_data_source_description.sameKind(data_source_description) && (source_data_source_description.is_encrypted == copy_encrypted)) { - /// getBlobPath() can return more than 3 elements if the file is stored as multiple objects in AzureBlobStorage bucket. + /// getBlobPath() can return more than 3 elements if the file is stored as multiple objects in AzureBlobStorage container. /// In this case we can't use the native copy. 
if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) { @@ -200,8 +200,8 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu copyAzureBlobStorageFile( src_client, client, - /* src_bucket */ blob_path[1], - /* src_key= */ blob_path[0], + /* src_container */ blob_path[1], + /* src_path */ blob_path[0], start_pos, length, configuration.container, @@ -238,8 +238,8 @@ void BackupWriterAzureBlobStorage::copyFile(const String & destination, const St fs::path(configuration.blob_path), 0, size, - /* dest_bucket= */ destination, - /* dest_key= */ configuration.blob_path, + /* dest_container */ destination, + /* dest_path */ configuration.blob_path, settings, read_settings, {}, diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 5ca30fa8071..df1341efdd1 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -46,8 +46,8 @@ namespace std::shared_ptr client_, size_t offset_, size_t total_size_, - const String & dest_bucket_, - const String & dest_key_, + const String & dest_container_, + const String & dest_blob_, std::shared_ptr settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunner schedule_, @@ -57,8 +57,8 @@ namespace , client(client_) , offset (offset_) , total_size (total_size_) - , dest_bucket(dest_bucket_) - , dest_key(dest_key_) + , dest_container(dest_container_) + , dest_blob(dest_blob_) , settings(settings_) , object_metadata(object_metadata_) , schedule(schedule_) @@ -75,8 +75,8 @@ namespace std::shared_ptr client; size_t offset; size_t total_size; - const String & dest_bucket; - const String & dest_key; + const String & dest_container; + const String & dest_blob; std::shared_ptr settings; const std::optional> & object_metadata; ThreadPoolCallbackRunner schedule; @@ -170,7 +170,7 @@ namespace void completeMultipartUpload() { - auto block_blob_client = client->GetBlockBlobClient(dest_key); + auto block_blob_client = client->GetBlockBlobClient(dest_blob); block_blob_client.CommitBlockList(block_ids); } @@ -207,7 +207,7 @@ namespace void uploadPart(size_t part_offset, size_t part_size) { - LOG_TRACE(log, "Writing part. Bucket: {}, Key: {}, Size: {}", dest_bucket, dest_key, part_size); + LOG_TRACE(log, "Writing part. Container: {}, Blob: {}, Size: {}", dest_container, dest_blob, part_size); if (!part_size) { @@ -286,7 +286,7 @@ namespace std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race task.block_id = block_id; - LOG_TRACE(log, "Writing part finished. Bucket: {}, Key: {}, block_id: {}, Parts: {}", dest_bucket, dest_key, block_id, bg_tasks.size()); + LOG_TRACE(log, "Writing part finished. 
Container: {}, Blob: {}, block_id: {}, Parts: {}", dest_container, dest_blob, block_id, bg_tasks.size()); } String processUploadPartRequest(UploadPartTask & task) @@ -295,7 +295,7 @@ namespace if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureUploadPart); - auto block_blob_client = client->GetBlockBlobClient(dest_key); + auto block_blob_client = client->GetBlockBlobClient(dest_blob); task.block_id = getRandomASCIIString(64); Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(task.data), task.size); block_blob_client.StageBlock(task.block_id, memory); @@ -330,14 +330,14 @@ void copyDataToAzureBlobStorageFile( size_t offset, size_t size, std::shared_ptr & dest_client, - const String & dest_bucket, - const String & dest_key, + const String & dest_container, + const String & dest_blob, std::shared_ptr settings, const std::optional> & object_metadata, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) { - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyDataToAzureBlobStorageFile")}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container, dest_blob, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyDataToAzureBlobStorageFile")}; helper.performCopy(); } @@ -345,12 +345,12 @@ void copyDataToAzureBlobStorageFile( void copyAzureBlobStorageFile( std::shared_ptr src_client, std::shared_ptr dest_client, - const String & src_bucket, - const String & src_key, + const String & src_container, + const String & src_blob, size_t offset, size_t size, - const String & dest_bucket, - const String & dest_key, + const String & dest_container, + const String & dest_blob, std::shared_ptr settings, const ReadSettings & read_settings, const std::optional> & object_metadata, @@ -363,21 +363,21 @@ void copyAzureBlobStorageFile( ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); - auto block_blob_client_src = src_client->GetBlockBlobClient(src_key); - auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_key); + auto block_blob_client_src = src_client->GetBlockBlobClient(src_blob); + auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_blob); auto uri = block_blob_client_src.GetUrl(); block_blob_client_dest.CopyFromUri(uri); } else { - LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Bucket: {}, Key: {}", src_bucket, src_key); + LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container, src_blob); auto create_read_buffer = [&] { - return std::make_unique(src_client, src_key, read_settings, settings->max_single_read_retries, + return std::make_unique(src_client, src_blob, read_settings, settings->max_single_read_retries, settings->max_single_download_retries); }; - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_bucket, dest_key, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container, dest_blob, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; helper.performCopy(); } } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h 
b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index 31228fbcb23..059d0318f57 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -23,12 +23,12 @@ using CreateReadBuffer = std::function()>; void copyAzureBlobStorageFile( std::shared_ptr src_client, std::shared_ptr dest_client, - const String & src_bucket, - const String & src_key, + const String & src_container, + const String & src_path, size_t src_offset, size_t src_size, - const String & dest_bucket, - const String & dest_key, + const String & dest_container, + const String & dest_path, std::shared_ptr settings, const ReadSettings & read_settings, const std::optional> & object_metadata = std::nullopt, @@ -46,8 +46,8 @@ void copyDataToAzureBlobStorageFile( size_t offset, size_t size, std::shared_ptr & client, + const String & dest_container, const String & dest_bucket, - const String & dest_key, std::shared_ptr settings, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, From 91bad5bc39963e9450f284dfc6b45fd69fa146de Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 4 Jan 2024 16:06:36 +0100 Subject: [PATCH 048/884] Updated to use MultiVersion for BlobContainerClient in Backups and updated to get client from disk --- src/Backups/BackupIO_AzureBlobStorage.cpp | 72 +++++-------------- src/Backups/BackupIO_AzureBlobStorage.h | 4 +- .../AzureBlobStorage/AzureObjectStorage.h | 5 ++ .../copyAzureBlobStorageFile.cpp | 20 +++--- .../copyAzureBlobStorageFile.h | 6 +- 5 files changed, 37 insertions(+), 70 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index de40fc6b33b..968a60c566f 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -27,8 +27,6 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; } -//using AzureClientPtr = std::shared_ptr; - BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( StorageAzureBlob::Configuration configuration_, const ReadSettings & read_settings_, @@ -38,12 +36,13 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( , data_source_description{DataSourceType::AzureBlobStorage, configuration_.container, false, false} , configuration(configuration_) { - client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupReaderAzureBlobStorage", - std::make_unique(*client.get()), + std::move(client_ptr), std::move(settings_as_unique_ptr)); + client = object_storage->getClient(); } BackupReaderAzureBlobStorage::~BackupReaderAzureBlobStorage() = default; @@ -89,7 +88,7 @@ std::unique_ptr BackupReaderAzureBlobStorage::readFile(const key = file_name; } return std::make_unique( - client, key, read_settings, settings->max_single_read_retries, + client.get(), key, read_settings, settings->max_single_read_retries, settings->max_single_download_retries); } @@ -113,23 +112,9 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, "Blob writing function called with unexpected blob_path.size={} or mode={}", blob_path.size(), mode); - std::shared_ptr dest_client; - if (configuration.container == blob_path[1]) - { - dest_client = client; - } - else - { - 
StorageAzureBlob::Configuration dest_configuration = configuration; - dest_configuration.container = blob_path[1]; - dest_configuration.blob_path = blob_path[0]; - dest_client = StorageAzureBlob::createClient(dest_configuration, /* is_read_only */ false); - } - - copyAzureBlobStorageFile( client, - dest_client, + reinterpret_cast(destination_disk->getObjectStorage().get())->getClient(), configuration.container, fs::path(configuration.blob_path) / path_in_backup, 0, @@ -163,12 +148,13 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( , data_source_description{DataSourceType::AzureBlobStorage,configuration_.container, false, false} , configuration(configuration_) { - client = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); + auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupWriterAzureBlobStorage", - std::make_unique(*client.get()), - std::move(settings_as_unique_ptr)); + std::move(client_ptr), + std::move(settings_as_unique_ptr)); + client = object_storage->getClient(); } void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, @@ -182,23 +168,9 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu /// In this case we can't use the native copy. if (auto blob_path = src_disk->getBlobPath(src_path); blob_path.size() == 2) { - - std::shared_ptr src_client; - if (configuration.container == blob_path[1]) - { - src_client = client; - } - else - { - StorageAzureBlob::Configuration src_configuration = configuration; - src_configuration.container = blob_path[1]; - src_configuration.blob_path = blob_path[0]; - src_client = StorageAzureBlob::createClient(src_configuration, /* is_read_only */ false); - } - LOG_TRACE(log, "Copying file {} from disk {} to AzureBlobStorag", src_path, src_disk->getName()); copyAzureBlobStorageFile( - src_client, + reinterpret_cast(src_disk->getObjectStorage().get())->getClient(), client, /* src_container */ blob_path[1], /* src_path */ blob_path[0], @@ -220,26 +192,16 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu void BackupWriterAzureBlobStorage::copyFile(const String & destination, const String & source, size_t size) { - std::shared_ptr src_client; - std::shared_ptr dest_client; - StorageAzureBlob::Configuration src_configuration = configuration; - src_configuration.container = source; - src_client = StorageAzureBlob::createClient(src_configuration, /* is_read_only */ false); - - StorageAzureBlob::Configuration dest_configuration = configuration; - dest_configuration.container = destination; - dest_client = StorageAzureBlob::createClient(dest_configuration, /* is_read_only */ false); - LOG_TRACE(log, "Copying file inside backup from {} to {} ", source, destination); copyAzureBlobStorageFile( - src_client, - dest_client, + client, + client, configuration.container, - fs::path(configuration.blob_path), + fs::path(source), 0, size, - /* dest_container */ destination, - /* dest_path */ configuration.blob_path, + /* dest_container */ configuration.container, + /* dest_path */ destination, settings, read_settings, {}, @@ -303,7 +265,7 @@ std::unique_ptr BackupWriterAzureBlobStorage::readFile(const String } return std::make_unique( - client, key, read_settings, 
settings->max_single_read_retries, + client.get(), key, read_settings, settings->max_single_read_retries, settings->max_single_download_retries); } @@ -319,7 +281,7 @@ std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const Strin key = file_name; } return std::make_unique( - client, + client.get(), key, settings->max_single_part_upload_size, DBMS_DEFAULT_BUFFER_SIZE, diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 87a6c3ef675..12bf073cd08 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -28,7 +28,7 @@ public: private: const DataSourceDescription data_source_description; - std::shared_ptr client; + MultiVersion client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; std::shared_ptr settings; @@ -57,7 +57,7 @@ private: std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; void removeFilesBatch(const Strings & file_names); const DataSourceDescription data_source_description; - std::shared_ptr client; + MultiVersion client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; std::shared_ptr settings; diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 55c81b4b7d9..1ff4537742f 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -134,6 +134,11 @@ public: bool isRemote() const override { return true; } + MultiVersion & getClient() + { + return client; + } + private: const String name; /// client used to access the files in the Blob Storage cloud diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index df1341efdd1..4ec90d2830e 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -43,7 +43,7 @@ namespace public: UploadHelper( const CreateReadBuffer & create_read_buffer_, - std::shared_ptr client_, + MultiVersion & client_, size_t offset_, size_t total_size_, const String & dest_container_, @@ -72,7 +72,7 @@ namespace protected: std::function()> create_read_buffer; - std::shared_ptr client; + MultiVersion & client; size_t offset; size_t total_size; const String & dest_container; @@ -170,7 +170,7 @@ namespace void completeMultipartUpload() { - auto block_blob_client = client->GetBlockBlobClient(dest_blob); + auto block_blob_client = client.get()->GetBlockBlobClient(dest_blob); block_blob_client.CommitBlockList(block_ids); } @@ -295,7 +295,7 @@ namespace if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureUploadPart); - auto block_blob_client = client->GetBlockBlobClient(dest_blob); + auto block_blob_client = client.get()->GetBlockBlobClient(dest_blob); task.block_id = getRandomASCIIString(64); Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(task.data), task.size); block_blob_client.StageBlock(task.block_id, memory); @@ -329,7 +329,7 @@ void copyDataToAzureBlobStorageFile( const std::function()> & create_read_buffer, size_t offset, size_t size, - std::shared_ptr & dest_client, + MultiVersion & dest_client, const String & dest_container, const String & dest_blob, std::shared_ptr settings, @@ -343,8 +343,8 @@ void copyDataToAzureBlobStorageFile( void copyAzureBlobStorageFile( - std::shared_ptr src_client, - std::shared_ptr 
dest_client, + MultiVersion & src_client, + MultiVersion & dest_client, const String & src_container, const String & src_blob, size_t offset, @@ -363,8 +363,8 @@ void copyAzureBlobStorageFile( ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); - auto block_blob_client_src = src_client->GetBlockBlobClient(src_blob); - auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_blob); + auto block_blob_client_src = src_client.get()->GetBlockBlobClient(src_blob); + auto block_blob_client_dest = dest_client.get()->GetBlockBlobClient(dest_blob); auto uri = block_blob_client_src.GetUrl(); block_blob_client_dest.CopyFromUri(uri); } @@ -373,7 +373,7 @@ void copyAzureBlobStorageFile( LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container, src_blob); auto create_read_buffer = [&] { - return std::make_unique(src_client, src_blob, read_settings, settings->max_single_read_retries, + return std::make_unique(src_client.get(), src_blob, read_settings, settings->max_single_read_retries, settings->max_single_download_retries); }; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index 059d0318f57..a6502541db1 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -21,8 +21,8 @@ using CreateReadBuffer = std::function()>; /// Copies a file from AzureBlobStorage to AzureBlobStorage. /// The parameters `src_offset` and `src_size` specify a part in the source to copy. void copyAzureBlobStorageFile( - std::shared_ptr src_client, - std::shared_ptr dest_client, + MultiVersion & src_client, + MultiVersion & dest_client, const String & src_container, const String & src_path, size_t src_offset, @@ -45,7 +45,7 @@ void copyDataToAzureBlobStorageFile( const std::function()> & create_read_buffer, size_t offset, size_t size, - std::shared_ptr & client, + MultiVersion & client, const String & dest_container, const String & dest_bucket, std::shared_ptr settings, From c14605caa7f403531a6ff0663c242aa5d466ab07 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 4 Jan 2024 18:27:54 +0100 Subject: [PATCH 049/884] Added flag use_native_copy and updated to use StartCopyFromUri for native copy with large files --- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 3 ++- .../AzureBlobStorage/AzureObjectStorage.h | 10 +++---- .../copyAzureBlobStorageFile.cpp | 26 ++++++++++++++++--- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 9e703d6fc5e..e29def06363 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -167,7 +167,8 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getInt(config_prefix + ".list_object_keys_size", 1000), config.getUInt64(config_prefix + ".min_upload_part_size", 16 * 1024 * 1024), config.getUInt64(config_prefix + ".max_upload_part_size", 5ULL * 1024 * 1024 * 1024), - config.getUInt64(config_prefix + ".max_part_number", 10000) + config.getUInt64(config_prefix + ".max_part_number", 10000), + config.getBool(config_prefix + ".use_native_copy", false) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h 
b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 1ff4537742f..436b48c0ad4 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -27,7 +27,8 @@ struct AzureObjectStorageSettings int list_object_keys_size_, size_t min_upload_part_size_, size_t max_upload_part_size_, - size_t max_part_number_) + size_t max_part_number_, + bool use_native_copy_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) @@ -36,6 +37,7 @@ struct AzureObjectStorageSettings , min_upload_part_size(min_upload_part_size_) , max_upload_part_size(max_upload_part_size_) , max_part_number(max_part_number_) + , use_native_copy(use_native_copy_) { } @@ -49,6 +51,7 @@ struct AzureObjectStorageSettings size_t min_upload_part_size = 16 * 1024 * 1024; size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; size_t max_part_number = 10000; + bool use_native_copy = false; }; using AzureClient = Azure::Storage::Blobs::BlobContainerClient; @@ -134,10 +137,7 @@ public: bool isRemote() const override { return true; } - MultiVersion & getClient() - { - return client; - } + MultiVersion & getClient() { return client; } private: const String name; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 4ec90d2830e..9db5ddb476a 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -30,6 +30,7 @@ namespace ErrorCodes { extern const int LOGICAL_ERROR; extern const int INVALID_CONFIG_PARAMETER; + extern const int AZURE_BLOB_STORAGE_ERROR; } @@ -358,15 +359,34 @@ void copyAzureBlobStorageFile( bool for_disk_azure_blob_storage) { - if (size < max_single_operation_copy_size) + if (settings->use_native_copy ) { ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); + auto block_blob_client_src = src_client.get()->GetBlockBlobClient(src_blob); auto block_blob_client_dest = dest_client.get()->GetBlockBlobClient(dest_blob); - auto uri = block_blob_client_src.GetUrl(); - block_blob_client_dest.CopyFromUri(uri); + auto source_uri = block_blob_client_src.GetUrl(); + + if (size < max_single_operation_copy_size) + { + block_blob_client_dest.CopyFromUri(source_uri); + } + else + { + Azure::Storage::Blobs::StartBlobCopyOperation operation = block_blob_client_dest.StartCopyFromUri(source_uri); + + // Wait for the operation to finish, checking for status every 100 second. 
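+            // Note: the interval passed to PollUntilDone() below is the polling period, so the copy
+            // status is re-checked every 100 milliseconds until the server-side copy reaches a final state.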
+ auto copy_response = operation.PollUntilDone(std::chrono::milliseconds(100)); + auto properties_model = copy_response.Value; + + if (properties_model.CopySource.HasValue()) + { + throw Exception(ErrorCodes::AZURE_BLOB_STORAGE_ERROR, "Copy failed"); + } + + } } else { From 2ee68933123583fe585093868e65c3562d36d66a Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 5 Jan 2024 10:58:04 +0100 Subject: [PATCH 050/884] Updated to return container for getObjectsNamespace --- src/Backups/BackupIO_AzureBlobStorage.cpp | 6 ++++-- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp | 7 +++++-- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.h | 6 ++++-- .../AzureBlobStorage/registerDiskAzureBlobStorage.cpp | 4 +++- src/Storages/StorageAzureBlob.cpp | 2 +- src/TableFunctions/TableFunctionAzureBlobStorage.cpp | 4 ++-- .../TableFunctionAzureBlobStorageCluster.cpp | 4 ++-- 7 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 968a60c566f..5ddbb42e2c0 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -41,7 +41,8 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupReaderAzureBlobStorage", std::move(client_ptr), - std::move(settings_as_unique_ptr)); + std::move(settings_as_unique_ptr), + configuration_.container); client = object_storage->getClient(); } @@ -153,7 +154,8 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupWriterAzureBlobStorage", std::move(client_ptr), - std::move(settings_as_unique_ptr)); + std::move(settings_as_unique_ptr), + configuration_.container); client = object_storage->getClient(); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp index 068e2aebab1..1f92ef48350 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.cpp @@ -92,10 +92,12 @@ private: AzureObjectStorage::AzureObjectStorage( const String & name_, AzureClientPtr && client_, - SettingsPtr && settings_) + SettingsPtr && settings_, + const String & container_) : name(name_) , client(std::move(client_)) , settings(std::move(settings_)) + , container(container_) , log(&Poco::Logger::get("AzureObjectStorage")) { data_source_description.type = DataSourceType::AzureBlobStorage; @@ -379,7 +381,8 @@ std::unique_ptr AzureObjectStorage::cloneObjectStorage(const std return std::make_unique( name, getAzureBlobContainerClient(config, config_prefix), - getAzureBlobStorageSettings(config, config_prefix, context) + getAzureBlobStorageSettings(config, config_prefix, context), + container ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 436b48c0ad4..660d4a30889 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -66,7 +66,8 @@ public: AzureObjectStorage( const String & name_, AzureClientPtr && client_, - SettingsPtr && settings_); + SettingsPtr && settings_, + const String & container_); void listObjects(const std::string & path, 
RelativePathsWithMetadata & children, int max_keys) const override; @@ -125,7 +126,7 @@ public: const std::string & config_prefix, ContextPtr context) override; - String getObjectsNamespace() const override { return ""; } + String getObjectsNamespace() const override { return container ; } std::unique_ptr cloneObjectStorage( const std::string & new_namespace, @@ -144,6 +145,7 @@ private: /// client used to access the files in the Blob Storage cloud MultiVersion client; MultiVersion settings; + const String container; Poco::Logger * log; diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp index 7ba9d21db62..2ffd910f92a 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/registerDiskAzureBlobStorage.cpp @@ -26,10 +26,12 @@ void registerDiskAzureBlobStorage(DiskFactory & factory, bool global_skip_access { auto [metadata_path, metadata_disk] = prepareForLocalMetadata(name, config, config_prefix, context); + String container_name = config.getString(config_prefix + ".container_name", "default-container"); ObjectStoragePtr azure_object_storage = std::make_unique( name, getAzureBlobContainerClient(config, config_prefix), - getAzureBlobStorageSettings(config, config_prefix, context)); + getAzureBlobStorageSettings(config, config_prefix, context), + container_name); String key_prefix; auto metadata_storage = std::make_shared(metadata_disk, key_prefix); diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index f1070c8c31e..fcd7074b9d2 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -314,7 +314,7 @@ void registerStorageAzureBlob(StorageFactory & factory) return std::make_shared( std::move(configuration), - std::make_unique("AzureBlobStorage", std::move(client), std::move(settings)), + std::make_unique("AzureBlobStorage", std::move(client), std::move(settings),configuration.container), args.getContext(), args.table_id, args.columns, diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index d394c836369..b098cac5144 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -262,7 +262,7 @@ ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(Contex auto client = StorageAzureBlob::createClient(configuration, !is_insert_query); auto settings = StorageAzureBlob::createSettings(context); - auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings)); + auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings), configuration.container); return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context, false); } @@ -293,7 +293,7 @@ StoragePtr TableFunctionAzureBlobStorage::executeImpl(const ASTPtr & /*ast_funct StoragePtr storage = std::make_shared( configuration, - std::make_unique(table_name, std::move(client), std::move(settings)), + std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), context, StorageID(getDatabaseName(), table_name), columns, diff --git a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp index 
eee585967c2..1c3b302a186 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp @@ -40,7 +40,7 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( /// On worker node this filename won't contains globs storage = std::make_shared( configuration, - std::make_unique(table_name, std::move(client), std::move(settings)), + std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), context, StorageID(getDatabaseName(), table_name), columns, @@ -55,7 +55,7 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( storage = std::make_shared( cluster_name, configuration, - std::make_unique(table_name, std::move(client), std::move(settings)), + std::make_unique(table_name, std::move(client), std::move(settings), configuration.container), StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, From b250acff789620be57e21977d8f3d4a3468070d5 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Fri, 5 Jan 2024 11:26:32 +0100 Subject: [PATCH 051/884] Fixed style check --- src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 9db5ddb476a..3399f1705f4 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -359,7 +359,7 @@ void copyAzureBlobStorageFile( bool for_disk_azure_blob_storage) { - if (settings->use_native_copy ) + if (settings->use_native_copy) { ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) From 356fc0aadb8f7c0f15f72c3b72955e1db7046e48 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 7 Jan 2024 14:49:24 +0100 Subject: [PATCH 052/884] Fix tests --- src/Storages/StorageView.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index 2f7267e3701..1898e49de86 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -112,8 +112,14 @@ StorageView::StorageView( : IStorage(table_id_) { StorageInMemoryMetadata storage_metadata; - if (is_parameterized_view_ && !query.isParameterizedView()) + if (is_parameterized_view_) + { + if (!query.isParameterizedView()) + storage_metadata.setColumns(columns_); + } + else storage_metadata.setColumns(columns_); + storage_metadata.setComment(comment); if (!query.select) From fd92c1961e5f09411d83b21c4fe9f00b78be22ba Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 7 Jan 2024 16:33:48 +0100 Subject: [PATCH 053/884] Fix clang tidy build --- src/Backups/BackupIO_AzureBlobStorage.cpp | 12 ++++++------ src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp | 10 +++++----- src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h | 6 +++--- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 5ddbb42e2c0..8c6c1040eec 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -305,21 +305,21 @@ void BackupWriterAzureBlobStorage::removeFile(const String & file_name) object_storage->removeObjectIfExists(object); } -void BackupWriterAzureBlobStorage::removeFiles(const Strings & keys) +void BackupWriterAzureBlobStorage::removeFiles(const Strings & file_names) { StoredObjects 
objects; - for (const auto & key : keys) - objects.emplace_back(key); + for (const auto & file_name : file_names) + objects.emplace_back(file_name); object_storage->removeObjectsIfExist(objects); } -void BackupWriterAzureBlobStorage::removeFilesBatch(const Strings & keys) +void BackupWriterAzureBlobStorage::removeFilesBatch(const Strings & file_names) { StoredObjects objects; - for (const auto & key : keys) - objects.emplace_back(key); + for (const auto & file_name : file_names) + objects.emplace_back(file_name); object_storage->removeObjectsIfExist(objects); } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 3399f1705f4..272be914cc1 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -65,11 +65,11 @@ namespace , schedule(schedule_) , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) , log(log_) - , max_single_part_upload_size(settings_.get()->max_single_part_upload_size) + , max_single_part_upload_size(settings_->max_single_part_upload_size) { } - ~UploadHelper() {} + virtual ~UploadHelper() = default; protected: std::function()> create_read_buffer; @@ -114,9 +114,9 @@ namespace if (!total_size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. This must not happen"); - auto max_part_number = settings.get()->max_part_number; - auto min_upload_part_size = settings.get()->min_upload_part_size; - auto max_upload_part_size = settings.get()->max_upload_part_size; + auto max_part_number = settings->max_part_number; + auto min_upload_part_size = settings->min_upload_part_size; + auto max_upload_part_size = settings->max_upload_part_size; if (!max_part_number) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0"); diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index a6502541db1..b022151d32d 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -24,11 +24,11 @@ void copyAzureBlobStorageFile( MultiVersion & src_client, MultiVersion & dest_client, const String & src_container, - const String & src_path, + const String & src_blob, size_t src_offset, size_t src_size, const String & dest_container, - const String & dest_path, + const String & dest_blob, std::shared_ptr settings, const ReadSettings & read_settings, const std::optional> & object_metadata = std::nullopt, @@ -47,7 +47,7 @@ void copyDataToAzureBlobStorageFile( size_t size, MultiVersion & client, const String & dest_container, - const String & dest_bucket, + const String & dest_blob, std::shared_ptr settings, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, From f50f7f56949021d01ba692f6788e50d411ca8af9 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 8 Jan 2024 14:25:33 +0100 Subject: [PATCH 054/884] Removed unwanted includes --- .../registerBackupEngineAzureBlobStorage.cpp | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index ef95206831f..810da5adb3f 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -10,13 +10,11 @@ #include #include #include -#include #endif namespace DB { -namespace fs = std::filesystem; namespace 
ErrorCodes { @@ -25,23 +23,6 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } -#if USE_AZURE_BLOB_STORAGE -namespace -{ - String removeFileNameFromURL(String & url) - { - Poco::URI url2{url}; - String path = url2.getPath(); - size_t slash_pos = path.find_last_of('/'); - String file_name = path.substr(slash_pos + 1); - path.resize(slash_pos + 1); - url2.setPath(path); - url = url2.toString(); - return file_name; - } -} -#endif - void registerBackupEngineAzureBlobStorage(BackupFactory & factory) { From 2d914721e5101215c2c63c97151552cb7c8ff746 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 8 Jan 2024 15:10:37 +0100 Subject: [PATCH 055/884] Fix build --- .../registerBackupEngineAzureBlobStorage.cpp | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/Backups/registerBackupEngineAzureBlobStorage.cpp b/src/Backups/registerBackupEngineAzureBlobStorage.cpp index 810da5adb3f..3480ea75f1f 100644 --- a/src/Backups/registerBackupEngineAzureBlobStorage.cpp +++ b/src/Backups/registerBackupEngineAzureBlobStorage.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #endif @@ -23,6 +24,22 @@ namespace ErrorCodes extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; } +#if USE_AZURE_BLOB_STORAGE +namespace +{ + String removeFileNameFromURL(String & url) + { + Poco::URI url2{url}; + String path = url2.getPath(); + size_t slash_pos = path.find_last_of('/'); + String file_name = path.substr(slash_pos + 1); + path.resize(slash_pos + 1); + url2.setPath(path); + url = url2.toString(); + return file_name; + } +} +#endif void registerBackupEngineAzureBlobStorage(BackupFactory & factory) { From c5bf722ee2d2b50d1b0691112b769e3e67612214 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 8 Jan 2024 21:24:44 +0300 Subject: [PATCH 056/884] Create ch/chc/chl symlinks by cmake as well (for develop mode) Before, they had been created only by install target. 
Follow-up for: #56634 Signed-off-by: Azat Khuzhin --- programs/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/programs/CMakeLists.txt b/programs/CMakeLists.txt index b3a5af6d6c9..6e544bac81c 100644 --- a/programs/CMakeLists.txt +++ b/programs/CMakeLists.txt @@ -328,6 +328,10 @@ set (CLICKHOUSE_BUNDLE) if (ENABLE_CLICKHOUSE_SELF_EXTRACTING) list(APPEND CLICKHOUSE_BUNDLE self-extracting) endif () + +if (NOT BUILD_STANDALONE_KEEPER) + add_custom_target (ch ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse ch DEPENDS clickhouse) +endif() if (ENABLE_CLICKHOUSE_SERVER) add_custom_target (clickhouse-server ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-server DEPENDS clickhouse) install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-server" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) @@ -335,11 +339,13 @@ if (ENABLE_CLICKHOUSE_SERVER) endif () if (ENABLE_CLICKHOUSE_CLIENT) add_custom_target (clickhouse-client ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-client DEPENDS clickhouse) + add_custom_target (chc ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse chc DEPENDS clickhouse) install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-client" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) list(APPEND CLICKHOUSE_BUNDLE clickhouse-client) endif () if (ENABLE_CLICKHOUSE_LOCAL) add_custom_target (clickhouse-local ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse clickhouse-local DEPENDS clickhouse) + add_custom_target (chl ALL COMMAND ${CMAKE_COMMAND} -E create_symlink clickhouse chl DEPENDS clickhouse) install (FILES "${CMAKE_CURRENT_BINARY_DIR}/clickhouse-local" DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT clickhouse) list(APPEND CLICKHOUSE_BUNDLE clickhouse-local) endif () From 629d4b921e5cf2d709d2ca7a55658d95407e2ff7 Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 9 Jan 2024 15:38:04 +0000 Subject: [PATCH 057/884] Fix style --- src/Analyzer/Passes/IfConstantConditionPass.cpp | 2 +- src/Storages/StorageMerge.cpp | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Analyzer/Passes/IfConstantConditionPass.cpp b/src/Analyzer/Passes/IfConstantConditionPass.cpp index f3b8b712dbf..6b24eb1d539 100644 --- a/src/Analyzer/Passes/IfConstantConditionPass.cpp +++ b/src/Analyzer/Passes/IfConstantConditionPass.cpp @@ -57,7 +57,7 @@ public: } -void IfConstantConditionPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) +void IfConstantConditionPass::run(QueryTreeNodePtr & query_tree_node, ContextPtr context) { IfConstantConditionVisitor visitor(std::move(context)); visitor.visit(query_tree_node); diff --git a/src/Storages/StorageMerge.cpp b/src/Storages/StorageMerge.cpp index 15ca6e65482..ffbf98e85c7 100644 --- a/src/Storages/StorageMerge.cpp +++ b/src/Storages/StorageMerge.cpp @@ -92,7 +92,6 @@ namespace ErrorCodes extern const int SAMPLING_NOT_SUPPORTED; extern const int ALTER_OF_COLUMN_IS_FORBIDDEN; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; - extern const int LOGICAL_ERROR; } StorageMerge::DatabaseNameOrRegexp::DatabaseNameOrRegexp( From c30736d415fcdaccb68a1c0e37e8c4de9242e014 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 12 Jan 2024 15:31:15 +0000 Subject: [PATCH 058/884] Cosmetics --- src/Storages/MergeTree/MutateTask.cpp | 8 +-- ...mn_must_not_override_past_values.reference | 33 ++++++++++++ ...e_column_must_not_override_past_values.sql | 53 +++++++++++++++++++ ..._column_not_override_past_values.reference | 29 ---------- 
...ialize_column_not_override_past_values.sql | 49 ----------------- 5 files changed, 90 insertions(+), 82 deletions(-) create mode 100644 tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference create mode 100644 tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql delete mode 100644 tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference delete mode 100644 tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql diff --git a/src/Storages/MergeTree/MutateTask.cpp b/src/Storages/MergeTree/MutateTask.cpp index bb41608eb00..25fa45e7b68 100644 --- a/src/Storages/MergeTree/MutateTask.cpp +++ b/src/Storages/MergeTree/MutateTask.cpp @@ -76,8 +76,8 @@ static void splitAndModifyMutationCommands( { if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) { - /// For ordinary column with default expression, materialize column should not override past values - /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file + /// For ordinary column with default or materialized expression, MATERIALIZE COLUMN should not override past values + /// So we only mutate column if `command.column_name` is a default/materialized column or if the part does not have physical column file auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) { @@ -206,8 +206,8 @@ static void splitAndModifyMutationCommands( { if (command.type == MutationCommand::Type::MATERIALIZE_COLUMN) { - /// For ordinary column with default expression, materialize column should not override past values - /// So we only mutated column if `command.column_name` is a materialized column or if the part does not have physical column file + /// For ordinary column with default or materialized expression, MATERIALIZE COLUMN should not override past values + /// So we only mutate column if `command.column_name` is a default/materialized column or if the part does not have physical column file auto column_ordinary = table_columns.getOrdinary().tryGetByName(command.column_name); if (!column_ordinary || !part->tryGetColumn(command.column_name) || !part->hasColumnFiles(*column_ordinary)) for_interpreter.push_back(command); diff --git a/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference new file mode 100644 index 00000000000..a5a0370620b --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.reference @@ -0,0 +1,33 @@ +-- Compact parts +Origin +1 2 +2 54321 +After materialize +1 2 +2 54321 +-- Wide parts +Origin +1 2 +2 54321 +After materialize +1 2 +2 54321 +-- Nullable column != physically absent +Origin +1 2 +2 \N +3 54321 +After materialize +1 2 +2 \N +3 54321 +-- Parts with renamed column +Origin +1 2 +2 54321 +After rename +1 2 +2 54321 +After materialize +1 2 +2 54321 diff --git a/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql new file mode 100644 index 00000000000..825c7eab048 --- /dev/null +++ b/tests/queries/0_stateless/02946_materialize_column_must_not_override_past_values.sql @@ -0,0 +1,53 @@ +SET mutations_sync = 2; + 
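+-- mutations_sync = 2 makes every ALTER TABLE ... MATERIALIZE COLUMN below wait for the mutation
+-- to finish, so the subsequent SELECTs always observe the rewritten parts.
+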
+SELECT '-- Compact parts'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id, dflt) VALUES (1, 2); +INSERT INTO tab (id) VALUES (2); +SELECT 'Origin'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Wide parts'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id, dflt) VALUES (1, 2); +INSERT INTO tab (id) VALUES (2); +SELECT 'Origin'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Nullable column != physically absent'; + +CREATE TABLE tab (id Int64, dflt Nullable(Int64) DEFAULT 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; +INSERT INTO tab (id, dflt) VALUES (1, 2); +INSERT INTO tab (id, dflt) VALUES (2, NULL); +INSERT INTO tab (id) VALUES (3); +SELECT 'Origin'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN dflt; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; + +SELECT '-- Parts with renamed column'; + +CREATE TABLE tab (id Int64, dflt Int64 DEFAULT 54321) ENGINE MergeTree ORDER BY id; +INSERT INTO tab (id, dflt) VALUES (1, 2); +INSERT INTO tab (id) VALUES (2); +SELECT 'Origin'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab RENAME COLUMN dflt TO dflt2; +SELECT 'After rename'; +SELECT * FROM tab ORDER BY id; +ALTER TABLE tab MATERIALIZE COLUMN bar; +SELECT 'After materialize'; +SELECT * FROM tab ORDER BY id; +DROP TABLE tab; diff --git a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference deleted file mode 100644 index 6b0d88bd09b..00000000000 --- a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.reference +++ /dev/null @@ -1,29 +0,0 @@ ---Origin-- -1 2 -2 54321 ---After materialize-- -1 2 -2 54321 ---Origin-- -1 2 -2 54321 ---After materialize-- -1 2 -2 54321 ---Origin-- -1 2 -2 \N -3 54321 ---After materialize-- -1 2 -2 \N -3 54321 ---Origin-- -1 2 -2 54321 ---After rename-- -1 2 -2 54321 ---After materialize-- -1 2 -2 54321 diff --git a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql b/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql deleted file mode 100644 index 1815661e097..00000000000 --- a/tests/queries/0_stateless/02946_materialize_column_not_override_past_values.sql +++ /dev/null @@ -1,49 +0,0 @@ - -SET mutations_sync = 2; --- Compact parts -CREATE TABLE test (id Int64, foo Int64 default 54321) ENGINE MergeTree ORDER BY id; -INSERT INTO test ( id, foo ) values ( 1, 2 ); -INSERT INTO test ( id ) values ( 2 ); -SELECT '--Origin--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test MATERIALIZE COLUMN foo; -SELECT '--After materialize--'; -SELECT * FROM test ORDER BY id; -DROP TABLE test; - --- Wide parts -CREATE TABLE test (id Int64, foo Nullable(Int64) default 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; -INSERT INTO test ( id, foo ) values ( 1, 2 ); -INSERT INTO test ( id ) values ( 2 ); -SELECT '--Origin--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test MATERIALIZE COLUMN foo; -SELECT '--After materialize--'; -SELECT * FROM test ORDER BY id; -DROP TABLE test; - --- Nullable 
column != physically absent -CREATE TABLE test (id Int64, foo Nullable(Int64) default 54321) ENGINE MergeTree ORDER BY id SETTINGS min_bytes_for_wide_part = 1; -INSERT INTO test ( id, foo ) values ( 1, 2 ); -INSERT INTO test ( id, foo ) values ( 2, NULL ); -INSERT INTO test ( id ) values ( 3 ); -SELECT '--Origin--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test MATERIALIZE COLUMN foo; -SELECT '--After materialize--'; -SELECT * FROM test ORDER BY id; -DROP TABLE test; - --- Parts with renamed column -CREATE TABLE test (id Int64, foo Int64 default 54321) ENGINE MergeTree ORDER BY id; -INSERT INTO test ( id, foo ) values ( 1, 2 ); -INSERT INTO test ( id ) values ( 2 ); -SELECT '--Origin--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test RENAME COLUMN foo TO bar; -SELECT '--After rename--'; -SELECT * FROM test ORDER BY id; -ALTER TABLE test MATERIALIZE COLUMN bar; -SELECT '--After materialize--'; -SELECT * FROM test ORDER BY id; -DROP TABLE test; \ No newline at end of file From ffde721f08359e0437c44026881e2514012a4966 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Jan 2024 23:09:10 +0300 Subject: [PATCH 059/884] Update 02932_set_ttl_where.sql --- tests/queries/0_stateless/02932_set_ttl_where.sql | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/queries/0_stateless/02932_set_ttl_where.sql b/tests/queries/0_stateless/02932_set_ttl_where.sql index bf2b317c4bf..ee8473e1af2 100644 --- a/tests/queries/0_stateless/02932_set_ttl_where.sql +++ b/tests/queries/0_stateless/02932_set_ttl_where.sql @@ -1,3 +1,5 @@ +-- Tags: no-ordinary-database + create or replace table t_temp ( a UInt32, timestamp DateTime @@ -12,3 +14,5 @@ select sleep(1); insert into t_temp select rand(), now() from system.numbers limit 1_000_000; select sleep(1); optimize table t_temp final; + +DROP TABLE t_temp; From 12585ea0e4cae1771ee6b51dd85a309e5923f12c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 13 Jan 2024 23:10:27 +0300 Subject: [PATCH 060/884] Update TTLDescription.cpp --- src/Storages/TTLDescription.cpp | 25 ------------------------- 1 file changed, 25 deletions(-) diff --git a/src/Storages/TTLDescription.cpp b/src/Storages/TTLDescription.cpp index bfd3afc30d8..3db5269b617 100644 --- a/src/Storages/TTLDescription.cpp +++ b/src/Storages/TTLDescription.cpp @@ -117,11 +117,6 @@ TTLDescription::TTLDescription(const TTLDescription & other) , if_exists(other.if_exists) , recompression_codec(other.recompression_codec) { - // if (other.expression) - // expression = other.expression->clone(); - - // if (other.where_expression) - // where_expression = other.where_expression->clone(); } TTLDescription & TTLDescription::operator=(const TTLDescription & other) @@ -135,11 +130,6 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) else expression_ast.reset(); - // if (other.expression) - // expression = other.expression->clone(); - // else - // expression.reset(); - expression_columns = other.expression_columns; result_column = other.result_column; @@ -148,11 +138,6 @@ TTLDescription & TTLDescription::operator=(const TTLDescription & other) else where_expression_ast.reset(); - // if (other.where_expression) - // where_expression = other.where_expression->clone(); - // else - // where_expression.reset(); - where_expression_columns = other.where_expression_columns; where_result_column = other.where_result_column; group_by_keys = other.group_by_keys; @@ -179,7 +164,6 @@ static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndType auto dag = 
analyzer.getActionsDAG(false); const auto * col = &dag->findInOutputs(ast->getColumnName()); - // std::cerr << "buildExpressionAndSets " << ttl_string << std::endl; if (col->result_name != ttl_string) col = &dag->addAlias(*col, ttl_string); @@ -189,10 +173,6 @@ static ExpressionAndSets buildExpressionAndSets(ASTPtr & ast, const NamesAndType result.expression = std::make_shared(dag, ExpressionActionsSettings::fromContext(context)); result.sets = analyzer.getPreparedSets(); - // std::cerr << "--------- buildExpressionAndSets\n"; - // std::cerr << result.expression->dumpActions() << std::endl; - // std::cerr << result.sets->getSubqueries().size() << std::endl; - return result; } @@ -232,8 +212,6 @@ TTLDescription TTLDescription::getTTLFromAST( auto expression = buildExpressionAndSets(ttl_ast, columns.getAllPhysical(), context).expression; result.expression_columns = expression->getRequiredColumnsWithTypes(); - // auto syntax_analyzer_result = TreeRewriter(context).analyze(ttl_ast, columns.getAllPhysical()); - // result.expression = ExpressionAnalyzer(ttl_ast, syntax_analyzer_result, context).getActions(false); result.result_column = expression->getSampleBlock().safeGetByPosition(0).name; ExpressionActionsPtr where_expression; @@ -256,9 +234,6 @@ TTLDescription TTLDescription::getTTLFromAST( { result.where_expression_ast = where_expr_ast->clone(); where_expression = buildExpressionAndSets(where_expr_ast, columns.getAllPhysical(), context).expression; - // auto where_syntax_result = TreeRewriter(context).analyze(where_expr_ast, columns.getAllPhysical()); - // result.where_expression = ExpressionAnalyzer(where_expr_ast, where_syntax_result, context).getActions(false); - result.where_expression_columns = where_expression->getRequiredColumnsWithTypes(); result.where_result_column = where_expression->getSampleBlock().safeGetByPosition(0).name; } From 776ea26ce71287735897b00c65b47d73e8d9811c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 02:45:51 +0300 Subject: [PATCH 061/884] Update PreparedSets.h --- src/Interpreters/PreparedSets.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Interpreters/PreparedSets.h b/src/Interpreters/PreparedSets.h index 30bfda4700d..4f5ca337c5b 100644 --- a/src/Interpreters/PreparedSets.h +++ b/src/Interpreters/PreparedSets.h @@ -115,7 +115,6 @@ public: SetPtr buildSetInplace(const ContextPtr & context); std::unique_ptr build(const ContextPtr & context); - void buildSetInplace(const ContextPtr & context); QueryTreeNodePtr detachQueryTree() { return std::move(query_tree); } void setQueryPlan(std::unique_ptr source_); From 1afc5e8c01685d1bb3e86b5a0fff55618db517b0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 03:45:42 +0100 Subject: [PATCH 062/884] Enable coverage for debug build --- docker/packager/packager | 8 ++++++++ tests/ci/build_check.py | 2 ++ tests/ci/ci_config.py | 2 ++ 3 files changed, 12 insertions(+) diff --git a/docker/packager/packager b/docker/packager/packager index ade36a55591..4c443896f4a 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -121,6 +121,7 @@ def is_release_build(debug_build: bool, package_type: str, sanitizer: str) -> bo def parse_env_variables( debug_build: bool, + coverage: bool, compiler: str, sanitizer: str, package_type: str, @@ -287,6 +288,9 @@ def parse_env_variables( else: result.append("BUILD_TYPE=None") + if coverage: + result.append("SANITIZE_COVERAGE=1") + if not cache: cmake_flags.append("-DCOMPILER_CACHE=disabled") @@ -415,6 +419,9 @@ def parse_args() -> 
argparse.Namespace: choices=("address", "thread", "memory", "undefined", ""), default="", ) + parser.add_argument( + "--coverage", action="store_true", help="enable granular coverage with introspection" + ) parser.add_argument("--clang-tidy", action="store_true") parser.add_argument( @@ -507,6 +514,7 @@ def main() -> None: env_prepared = parse_env_variables( args.debug_build, + args.coverage, args.compiler, args.sanitizer, args.package_type, diff --git a/tests/ci/build_check.py b/tests/ci/build_check.py index 27243aac4f1..fe4308f5933 100644 --- a/tests/ci/build_check.py +++ b/tests/ci/build_check.py @@ -73,6 +73,8 @@ def get_packager_cmd( cmd += " --debug-build" if build_config.sanitizer: cmd += f" --sanitizer={build_config.sanitizer}" + if build_config.coverage: + cmd += " --coverage" if build_config.tidy: cmd += " --clang-tidy" diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index e3319fe4a72..b8dff3f0a28 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -62,6 +62,7 @@ class BuildConfig: package_type: Literal["deb", "binary", "fuzzers"] additional_pkgs: bool = False debug_build: bool = False + coverage: bool = False sanitizer: str = "" tidy: bool = False sparse_checkout: bool = False @@ -473,6 +474,7 @@ CI_CONFIG = CiConfig( name="package_debug", compiler="clang-17", debug_build=True, + coverage=True, package_type="deb", sparse_checkout=True, ), From 0219d58d925bd3f7901f9251c2abca76c1ae00dc Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Sun, 14 Jan 2024 02:56:50 +0000 Subject: [PATCH 063/884] Automatic style fix --- docker/packager/packager | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 4c443896f4a..3e7f1ba447e 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -420,7 +420,9 @@ def parse_args() -> argparse.Namespace: default="", ) parser.add_argument( - "--coverage", action="store_true", help="enable granular coverage with introspection" + "--coverage", + action="store_true", + help="enable granular coverage with introspection", ) parser.add_argument("--clang-tidy", action="store_true") From 6405decbb0ad0e80fe20b22a9956481abbe3b479 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 05:14:54 +0100 Subject: [PATCH 064/884] Fix Python --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index 4c443896f4a..2e2b6550636 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -289,7 +289,7 @@ def parse_env_variables( result.append("BUILD_TYPE=None") if coverage: - result.append("SANITIZE_COVERAGE=1") + cmake_flags.append("-DSANITIZE_COVERAGE=1") if not cache: cmake_flags.append("-DCOMPILER_CACHE=disabled") From 9f5a7c51175dc3d4cfe46065b4912e7973a30983 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 08:05:57 +0100 Subject: [PATCH 065/884] Fix error --- cmake/sanitize.cmake | 1 + contrib/jemalloc-cmake/CMakeLists.txt | 3 +++ 2 files changed, 4 insertions(+) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 3f7a8498059..3882b51227e 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -82,3 +82,4 @@ if (SANITIZE_COVERAGE) endif() set (WITHOUT_COVERAGE_FLAGS "-fno-profile-instr-generate -fno-coverage-mapping -fno-sanitize-coverage=trace-pc-guard,pc-table") +set (WITHOUT_COVERAGE_FLAGS_LIST -fno-profile-instr-generate -fno-coverage-mapping -fno-sanitize-coverage=trace-pc-guard,pc-table) diff 
--git a/contrib/jemalloc-cmake/CMakeLists.txt b/contrib/jemalloc-cmake/CMakeLists.txt index 15e965ed841..f85a38dcf8a 100644 --- a/contrib/jemalloc-cmake/CMakeLists.txt +++ b/contrib/jemalloc-cmake/CMakeLists.txt @@ -161,6 +161,9 @@ target_include_directories(_jemalloc SYSTEM PRIVATE target_compile_definitions(_jemalloc PRIVATE -DJEMALLOC_NO_PRIVATE_NAMESPACE) +# Because our coverage callbacks call malloc, and recursive call of malloc could not work. +target_compile_options(_jemalloc PRIVATE ${WITHOUT_COVERAGE_FLAGS_LIST}) + if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") target_compile_definitions(_jemalloc PRIVATE -DJEMALLOC_DEBUG=1 From 3d904cbf81eb6ce2472eabdcd0be5f6955984ce5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 08:09:08 +0100 Subject: [PATCH 066/884] Slightly better --- base/base/coverage.cpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index d70c3bcd82b..ac8055e836c 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -1,4 +1,5 @@ #include "coverage.h" +#include #pragma GCC diagnostic ignored "-Wreserved-identifier" @@ -57,6 +58,14 @@ namespace uintptr_t * all_addresses_array = nullptr; size_t all_addresses_array_size = 0; + + uintptr_t * allocate(size_t size) + { + void * map = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (MAP_FAILED == map) + return nullptr; + return static_cast(map); + } } extern "C" @@ -79,7 +88,7 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop) coverage_array_size = stop - start; /// Note: we will leak this. - coverage_array = static_cast(malloc(sizeof(uintptr_t) * coverage_array_size)); + coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); resetCoverage(); } @@ -92,7 +101,7 @@ void __sanitizer_cov_pcs_init(const uintptr_t * pcs_begin, const uintptr_t * pcs return; pc_table_initialized = true; - all_addresses_array = static_cast(malloc(sizeof(uintptr_t) * coverage_array_size)); + all_addresses_array = allocate(sizeof(uintptr_t) * coverage_array_size); all_addresses_array_size = pcs_end - pcs_begin; /// They are not a real pointers, but also contain a flag in the most significant bit, From 33d9a1d4e83d58f15e36ea6e88908c8410f03c40 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 22:03:47 +0100 Subject: [PATCH 067/884] Documentation --- src/Functions/coverage.cpp | 48 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/src/Functions/coverage.cpp b/src/Functions/coverage.cpp index 8a62469fa54..86de047a76b 100644 --- a/src/Functions/coverage.cpp +++ b/src/Functions/coverage.cpp @@ -85,8 +85,52 @@ public: REGISTER_FUNCTION(Coverage) { - factory.registerFunction("coverage", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }); - factory.registerFunction("coverageAll", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::All)); }); + factory.registerFunction("coverage", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }, + FunctionDocumentation + { + .description=R"( +This function is only available if ClickHouse was built with the SANITIZE_COVERAGE=1 option. + +It returns an array of unique addresses (a subset of the instrumented points in code) in the code +encountered at runtime after the previous coverage reset (with the `SYSTEM RESET COVERAGE` query) or after server startup. 
+ +[example:functions] + +The order of array elements is undetermined. + +You can use another function, `coverageAll` to find all instrumented addresses in the code to compare and calculate the percentage. + +You can process the addresses with the `addressToSymbol` (possibly with `demangle`) and `addressToLine` functions +to calculate symbol-level, file-level, or line-level coverage. + +If you run multiple tests sequentially and reset the coverage with the `SYSTEM RESET COVERAGE` query between the tests, +you can obtain a coverage information for every test in isolation, to find which functions are covered by which tests and vise-versa. + +By default, every *basic block* in the code is covered, which roughly means - a sequence of instructions without jumps, +e.g. a body of for loop without ifs, or a single branch of if. + +See https://clang.llvm.org/docs/SanitizerCoverage.html for more information. +)", + .examples{ + {"functions", "SELECT DISTINCT demangle(addressToSymbol(arrayJoin(coverage())))", ""}}, + .categories{"Introspection"} + }); + + factory.registerFunction("coverageAll", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::All)); }, + FunctionDocumentation + { + .description=R"( +This function is only available if ClickHouse was built with the SANITIZE_COVERAGE=1 option. + +It returns an array of all unique addresses in the code instrumented for coverage +- all possible addresses that can appear in the result of the `coverage` function. + +You can use this function, and the `coverage` function to compare and calculate the coverage percentage. + +See the `coverage` function for the details. +)", + .categories{"Introspection"} + }); } } From 3bd2c7e384d07d07da8768aa4708c7726b828db5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 14 Jan 2024 22:06:25 +0100 Subject: [PATCH 068/884] Report coverage if available --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index c7049b0e0c8..2d278f18176 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -2840,7 +2840,7 @@ def parse_args(): parser.add_argument( "--collect-per-test-coverage", action="store_true", - default=False, + default=True, help="Create `system.coverage` table on the server and collect information about low-level code coverage on a per test basis there", ) parser.add_argument( From 9141e1693f03f39d2eda37423918d2b2d873877a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:24:17 +0100 Subject: [PATCH 069/884] Calculate cumulative coverage by default. --- tests/clickhouse-test | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 2d278f18176..f1b20a3a43e 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1322,6 +1322,7 @@ class TestCase: # We want to calculate per-test code coverage. That's why we reset it before each test. if ( args.collect_per_test_coverage + and args.reset_coverage_before_every_test and BuildFlags.SANITIZE_COVERAGE in args.build_flags ): clickhouse_execute( @@ -2843,6 +2844,12 @@ def parse_args(): default=True, help="Create `system.coverage` table on the server and collect information about low-level code coverage on a per test basis there", ) + parser.add_argument( + "--reset-coverage-before-every-test", + action="store_true", + default=False, + help="Collect isolated test coverage for every test instead of a cumulative. 
Useful only when tests are run sequentially.", + ) parser.add_argument( "--report-logs-stats", action="store_true", From f7abeff0857ec231a7107d2a006b5f98b60a689f Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:25:07 +0100 Subject: [PATCH 070/884] Slightly better reporting --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index f1b20a3a43e..e480957e5f4 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1259,7 +1259,7 @@ class TestCase: retry_error_codes=True, ).decode() - description_full += f" Coverage: {coverage}" + description_full += f" (coverage: {coverage})" description_full += "\n" From 3e09feda336a355173b46ec85a9cd86d640f3348 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:36:07 +0100 Subject: [PATCH 071/884] More functions --- base/base/coverage.cpp | 21 +++++++++++++++------ base/base/coverage.h | 5 ++++- src/Functions/coverage.cpp | 33 ++++++++++++++++++++++++++++----- tests/clickhouse-test | 6 +++--- 4 files changed, 50 insertions(+), 15 deletions(-) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index ac8055e836c..499e384d21f 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -53,7 +53,8 @@ namespace uint32_t * guards_start = nullptr; uint32_t * guards_end = nullptr; - uintptr_t * coverage_array = nullptr; + uintptr_t * current_coverage_array = nullptr; + uintptr_t * cumulative_coverage_array = nullptr; size_t coverage_array_size = 0; uintptr_t * all_addresses_array = nullptr; @@ -88,7 +89,8 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop) coverage_array_size = stop - start; /// Note: we will leak this. - coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); + current_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); + cumulative_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); resetCoverage(); } @@ -126,15 +128,22 @@ void __sanitizer_cov_trace_pc_guard(uint32_t * guard) /// and use them to dereference an array or a bit vector. void * pc = __builtin_return_address(0); - coverage_array[guard - guards_start] = reinterpret_cast(pc); + current_coverage_array[guard - guards_start] = reinterpret_cast(pc); + cumulative_coverage_array[guard - guards_start] = reinterpret_cast(pc); } } __attribute__((no_sanitize("coverage"))) -std::span getCoverage() +std::span getCurrentCoverage() { - return {coverage_array, coverage_array_size}; + return {current_coverage_array, coverage_array_size}; +} + +__attribute__((no_sanitize("coverage"))) +std::span getCumulativeCoverage() +{ + return {cumulative_coverage_array, coverage_array_size}; } __attribute__((no_sanitize("coverage"))) @@ -146,7 +155,7 @@ std::span getAllInstrumentedAddresses() __attribute__((no_sanitize("coverage"))) void resetCoverage() { - memset(coverage_array, 0, coverage_array_size * sizeof(*coverage_array)); + memset(current_coverage_array, 0, coverage_array_size * sizeof(*current_coverage_array)); /// The guard defines whether the __sanitizer_cov_trace_pc_guard should be called. /// For example, you can unset it after first invocation to prevent excessive work. 
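The change above keeps two counter arrays: a resettable one served by coverageCurrent() and a cumulative one served by coverageCumulative(), while coverageAll() lists every instrumented point; the per-test logic added to clickhouse-test is essentially a reset, run, measure loop over these functions. The following Python sketch shows that loop driven over the HTTP interface. It is an illustration only, assuming a server built with -DSANITIZE_COVERAGE=1 listening on localhost:8123; the URL and the sample query are placeholders rather than values taken from the patch.

    import urllib.request


    def query(sql: str) -> str:
        """POST a statement to the ClickHouse HTTP interface and return the raw text result."""
        req = urllib.request.Request("http://localhost:8123/", data=sql.encode("utf-8"))
        with urllib.request.urlopen(req) as resp:
            return resp.read().decode("utf-8").strip()


    def coverage_percent_for(test_sql: str) -> float:
        """Run one statement in isolation and report the share of instrumented points it hit."""
        query("SYSTEM RESET COVERAGE")  # clear the resettable array, as clickhouse-test does between tests
        query(test_sql)
        covered = int(query("SELECT length(coverageCurrent())"))
        total = int(query("SELECT length(coverageAll())"))
        return 100.0 * covered / total if total else 0.0


    print(f"{coverage_percent_for('SELECT sum(number) FROM numbers(1000)'):.2f}% of instrumented points hit")

Note that SYSTEM RESET COVERAGE clears only the current array, so coverageCumulative() keeps growing for the lifetime of the server process.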
diff --git a/base/base/coverage.h b/base/base/coverage.h index f75ed2d3553..a6e5a6848d7 100644 --- a/base/base/coverage.h +++ b/base/base/coverage.h @@ -15,7 +15,10 @@ void dumpCoverageReportIfPossible(); /// Get accumulated unique program addresses of the instrumented parts of the code, /// seen so far after program startup or after previous reset. /// The returned span will be represented as a sparse map, containing mostly zeros, which you should filter away. -std::span getCoverage(); +std::span getCurrentCoverage(); + +/// Similar but not being reset. +std::span getCumulativeCoverage(); /// Get all instrumented addresses that could be in the coverage. std::span getAllInstrumentedAddresses(); diff --git a/src/Functions/coverage.cpp b/src/Functions/coverage.cpp index 86de047a76b..f4cac26df78 100644 --- a/src/Functions/coverage.cpp +++ b/src/Functions/coverage.cpp @@ -21,11 +21,14 @@ namespace enum class Kind { Current, + Cumulative, All }; /** If ClickHouse is build with coverage instrumentation, returns an array - * of currently accumulated (`coverage`) / all possible (`coverageAll`) unique code addresses. + * of currently accumulated (`coverageCurrent`) + * or accumulated since the startup (`coverageCumulative`) + * or all possible (`coverageAll`) unique code addresses. */ class FunctionCoverage : public IFunction { @@ -64,7 +67,11 @@ public: ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override { - auto coverage_table = kind == Kind::Current ? getCoverage() : getAllInstrumentedAddresses(); + auto coverage_table = kind == Kind::Current + ? getCurrentCoverage() + : (kind == Kind::Cumulative + ? getCumulativeCoverage() + : getAllInstrumentedAddresses()); auto column_addresses = ColumnUInt64::create(); auto & data = column_addresses->getData(); @@ -85,7 +92,7 @@ public: REGISTER_FUNCTION(Coverage) { - factory.registerFunction("coverage", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }, + factory.registerFunction("coverageCurrent", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Current)); }, FunctionDocumentation { .description=R"( @@ -112,7 +119,23 @@ e.g. a body of for loop without ifs, or a single branch of if. See https://clang.llvm.org/docs/SanitizerCoverage.html for more information. )", .examples{ - {"functions", "SELECT DISTINCT demangle(addressToSymbol(arrayJoin(coverage())))", ""}}, + {"functions", "SELECT DISTINCT demangle(addressToSymbol(arrayJoin(coverageCurrent())))", ""}}, + .categories{"Introspection"} + }); + + factory.registerFunction("coverageCumulative", [](ContextPtr){ return std::make_unique(std::make_shared(Kind::Cumulative)); }, + FunctionDocumentation + { + .description=R"( +This function is only available if ClickHouse was built with the SANITIZE_COVERAGE=1 option. + +It returns an array of unique addresses (a subset of the instrumented points in code) in the code +encountered at runtime after server startup. + +In contrast to `coverageCurrent` it cannot be reset with the `SYSTEM RESET COVERAGE`. + +See the `coverageCurrent` function for the details. +)", .categories{"Introspection"} }); @@ -127,7 +150,7 @@ It returns an array of all unique addresses in the code instrumented for coverag You can use this function, and the `coverage` function to compare and calculate the coverage percentage. -See the `coverage` function for the details. +See the `coverageCurrent` function for the details. 
)", .categories{"Introspection"} }); diff --git a/tests/clickhouse-test b/tests/clickhouse-test index e480957e5f4..a39c90947ba 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1249,13 +1249,13 @@ class TestCase: ): clickhouse_execute( args, - f"INSERT INTO system.coverage SELECT now(), '{self.case}', coverage()", + f"INSERT INTO system.coverage SELECT now(), '{self.case}', coverageCurrent()", retry_error_codes=True, ) coverage = clickhouse_execute( args, - "SELECT length(coverage())", + "SELECT length(coverageCurrent())", retry_error_codes=True, ).decode() @@ -2460,7 +2460,7 @@ def main(args): # Coverage collected at the system startup before running any tests: clickhouse_execute( args, - "INSERT INTO system.coverage SELECT now(), '', coverage()", + "INSERT INTO system.coverage SELECT now(), '', coverageCurrent()", ) total_tests_run = 0 From e4cd02ea39642dd9b8d519aee0426b752423c3bf Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:36:24 +0100 Subject: [PATCH 072/884] Fix typo --- src/IO/OpenedFile.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/IO/OpenedFile.h b/src/IO/OpenedFile.h index 10c36d9e1d3..4c4de2265bc 100644 --- a/src/IO/OpenedFile.h +++ b/src/IO/OpenedFile.h @@ -21,7 +21,7 @@ public: OpenedFile(const std::string & file_name_, int flags_); ~OpenedFile(); - /// Close prematurally. + /// Close prematurely. void close(); int getFD() const; @@ -40,4 +40,3 @@ private: }; } - From 30c362909089d6f7fe93b639dfdf1666d5bcfc7c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:36:44 +0100 Subject: [PATCH 073/884] An option to dump coverage to a file at exit --- programs/main.cpp | 47 ++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 46 insertions(+), 1 deletion(-) diff --git a/programs/main.cpp b/programs/main.cpp index 7d07112de66..4852ed8990e 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include @@ -19,6 +20,7 @@ #include #include +#include /// Universal executable for various clickhouse applications @@ -512,6 +514,49 @@ int main(int argc_, char ** argv_) if (main_func == printHelp && !argv.empty() && (argv.size() == 1 || argv[1][0] == '-')) main_func = mainEntryClickHouseLocal; - return main_func(static_cast(argv.size()), argv.data()); + int exit_code = main_func(static_cast(argv.size()), argv.data()); + +#if defined(SANITIZE_COVERAGE) + /// A user can request to dump the coverage information into files at exit. + /// This is useful for non-server applications such as clickhouse-format or clickhouse-client, + /// that cannot introspect it with SQL functions at runtime. + + /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for two filenames: + /// 'prefix.covered' and 'prefix.all' which will contain + /// the list of addresses of covered and all instrumented addresses, respectively. + + /// The format is even simpler than Clang's "sancov": an array of 64-bit addresses, native byte order, no header. + + if (const char * coverage_filename_prefix = getenv("CLICKHOUSE_WRITE_COVERAGE")) // NOLINT(concurrency-mt-unsafe) + { + auto dumpCoverage = [](const std::string & name, auto span) + { + /// Write only non-zeros. 
+ std::vector data; + data.reserve(span.size()); + for (auto addr : span) + if (addr) + data.push_back(addr); + + int fd = ::open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0400); + if (-1 == fd) + { + writeError("Cannot open a file to write the coverage data\n"); + } + else + { + if (!writeRetry(fd, reinterpret_cast(data.data()), data.size() * sizeof(data[0]))) + writeError("Cannot write the coverage data to a file\n"); + if (0 != ::close(fd)) + writeError("Cannot close the file with coverage data\n"); + } + }; + + dumpCoverage(coverage_filename_prefix + std::string(".covered"), getCumulativeCoverage()); + dumpCoverage(coverage_filename_prefix + std::string(".all"), getAllInstrumentedAddresses()); + } +#endif + + return exit_code; } #endif From fe952fb64c460c260c77336142b5eb4bd05b46d8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:38:42 +0100 Subject: [PATCH 074/884] Rename to system.coverage_log to simplify export --- tests/clickhouse-test | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index a39c90947ba..eb85bdff0f5 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1249,7 +1249,7 @@ class TestCase: ): clickhouse_execute( args, - f"INSERT INTO system.coverage SELECT now(), '{self.case}', coverageCurrent()", + f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', coverageCurrent()", retry_error_codes=True, ) @@ -2448,7 +2448,7 @@ def main(args): clickhouse_execute( args, """ - CREATE TABLE IF NOT EXISTS system.coverage + CREATE TABLE IF NOT EXISTS system.coverage_log ( time DateTime, test_name String, @@ -2460,7 +2460,7 @@ def main(args): # Coverage collected at the system startup before running any tests: clickhouse_execute( args, - "INSERT INTO system.coverage SELECT now(), '', coverageCurrent()", + "INSERT INTO system.coverage_log SELECT now(), '', coverageCurrent()", ) total_tests_run = 0 @@ -2842,7 +2842,7 @@ def parse_args(): "--collect-per-test-coverage", action="store_true", default=True, - help="Create `system.coverage` table on the server and collect information about low-level code coverage on a per test basis there", + help="Create `system.coverage_log` table on the server and collect information about low-level code coverage on a per test basis there", ) parser.add_argument( "--reset-coverage-before-every-test", From 7662628393f97dd1c094b3346cc55c71f10ad193 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:41:14 +0100 Subject: [PATCH 075/884] Export coverage to the CI database --- docker/test/base/setup_export_logs.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index ea82e071112..659bf29b057 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -129,6 +129,19 @@ function setup_logs_replication debug_or_sanitizer_build=$(clickhouse-client -q "WITH ((SELECT value FROM system.build_options WHERE name='BUILD_TYPE') AS build, (SELECT value FROM system.build_options WHERE name='CXX_FLAGS') as flags) SELECT build='Debug' OR flags LIKE '%fsanitize%'") echo "Build is debug or sanitizer: $debug_or_sanitizer_build" + # We will pre-create a table system.coverage_log. 
+ # It is normally created by clickhouse-test rather than the server, + # so we will create it in advance to make it be picked up by the next commands: + + clickhouse-client --query " + CREATE TABLE IF NOT EXISTS system.coverage_log + ( + time DateTime, + test_name String, + coverage Array(UInt64) + ) ENGINE = MergeTree ORDER BY test_name + " + # For each system log table: echo 'Create %_log tables' clickhouse-client --query "SHOW TABLES FROM system LIKE '%\\_log'" | while read -r table From 97200e2c5d65693ad5d1e6a7c7dea3d5cac0e23d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:46:01 +0100 Subject: [PATCH 076/884] Symbolization --- docker/test/base/setup_export_logs.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 659bf29b057..e141bc00a77 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -138,7 +138,8 @@ function setup_logs_replication ( time DateTime, test_name String, - coverage Array(UInt64) + coverage Array(UInt64), + symbols Array(LowCardinality(String)) MATERIALIZED arrayMap(x -> demangle(addressToSymbol(x)), coverage) ) ENGINE = MergeTree ORDER BY test_name " From bf2e5748575ad2eb74eb057e0ee242a149edecdb Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 00:48:47 +0100 Subject: [PATCH 077/884] Symbolization --- docker/test/base/setup_export_logs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index e141bc00a77..20dd864318f 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -188,7 +188,7 @@ function setup_logs_replication echo "Creating table system.${table}_sender" >&2 # Create Distributed table and materialized view to watch on the original table: - clickhouse-client --query " + clickhouse-client --asterisk_include_materialized_columns 1 --query " CREATE TABLE system.${table}_sender ENGINE = Distributed(${CLICKHOUSE_CI_LOGS_CLUSTER}, default, ${table}_${hash}) SETTINGS flush_on_detach=0 @@ -199,7 +199,7 @@ function setup_logs_replication echo "Creating materialized view system.${table}_watcher" >&2 - clickhouse-client --query " + clickhouse-client --asterisk_include_materialized_columns 1 --query " CREATE MATERIALIZED VIEW system.${table}_watcher TO system.${table}_sender AS SELECT ${EXTRA_COLUMNS_EXPRESSION_FOR_TABLE}, * FROM system.${table} From c5dfae1bcade85289b78f0bb760c92bcee078743 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 05:07:23 +0100 Subject: [PATCH 078/884] Fix error --- docker/test/base/setup_export_logs.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 20dd864318f..26fcd10d666 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -23,6 +23,10 @@ EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name, "} EXTRA_COLUMNS_TRACE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), lines Array(LowCardinality(String)), " EXTRA_COLUMNS_EXPRESSION_TRACE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> demangle(addressToSymbol(x)), trace)::Array(LowCardinality(String)) AS symbols, arrayMap(x -> addressToLine(x), trace)::Array(LowCardinality(String)) AS lines" +# coverage_log needs more columns for symbolization, but only symbol names 
(the line numbers are too heavy to calculate)
+EXTRA_COLUMNS_COVERAGE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), "
+EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x -> demangle(addressToSymbol(x)), coverage)::Array(LowCardinality(String)) AS symbols"
+
 
 function __set_connection_args
 {
@@ -138,8 +142,7 @@ function setup_logs_replication
         (
             time DateTime,
             test_name String,
-            coverage Array(UInt64),
-            symbols Array(LowCardinality(String)) MATERIALIZED arrayMap(x -> demangle(addressToSymbol(x)), coverage)
+            coverage Array(UInt64)
         ) ENGINE = MergeTree ORDER BY test_name
     "
 
@@ -158,7 +161,10 @@ function setup_logs_replication
             else
                 EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION_TRACE_LOG}"
             fi
-        else
+        elif [[ "$table" = "coverage_log" ]]; then
+            EXTRA_COLUMNS_FOR_TABLE="${EXTRA_COLUMNS_COVERAGE_LOG}"
+            EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG}"
+        else
             EXTRA_COLUMNS_FOR_TABLE="${EXTRA_COLUMNS}"
             EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION}"
         fi

From e13ca48bce836a2534047e59a4e922395a8f6a87 Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Mon, 15 Jan 2024 05:13:43 +0100
Subject: [PATCH 079/884] Better dump on exit

---
 programs/main.cpp | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/programs/main.cpp b/programs/main.cpp
index 4852ed8990e..8958d84e243 100644
--- a/programs/main.cpp
+++ b/programs/main.cpp
@@ -521,9 +521,8 @@ int main(int argc_, char ** argv_)
     /// This is useful for non-server applications such as clickhouse-format or clickhouse-client,
     /// that cannot introspect it with SQL functions at runtime.
 
-    /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for two filenames:
-    /// 'prefix.covered' and 'prefix.all' which will contain
-    /// the list of addresses of covered and all instrumented addresses, respectively.
+    /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for a filename 'prefix.pid'
+    /// containing the list of addresses of covered code.
 
     /// The format is even simpler than Clang's "sancov": an array of 64-bit addresses, native byte order, no header.
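Such dumps are easy to post-process outside ClickHouse. Below is a small Python sketch, not part of the patch, that decodes one; the filename is a hypothetical example of the 'prefix.pid' scheme described above, and the addresses it prints can later be symbolized with addressToSymbol on a server running the same binary.

    import struct
    import sys


    def read_coverage_dump(path: str) -> list:
        """Return the covered addresses stored as raw 64-bit integers, native byte order, no header."""
        with open(path, "rb") as f:
            payload = f.read()
        count = len(payload) // 8
        return list(struct.unpack(f"={count}Q", payload))  # '=' keeps the writer's native byte order


    if __name__ == "__main__":
        addresses = read_coverage_dump(sys.argv[1] if len(sys.argv) > 1 else "coverage.12345")
        print(f"{len(addresses)} covered addresses; first few: {[hex(a) for a in addresses[:5]]}")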
@@ -552,8 +551,7 @@ int main(int argc_, char ** argv_) } }; - dumpCoverage(coverage_filename_prefix + std::string(".covered"), getCumulativeCoverage()); - dumpCoverage(coverage_filename_prefix + std::string(".all"), getAllInstrumentedAddresses()); + dumpCoverage(fmt::format("{}.{}", coverage_filename_prefix, getpid()), getCumulativeCoverage()); } #endif From e49cfbef089499a457c8793724629e2e94c8dc37 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 05:40:03 +0100 Subject: [PATCH 080/884] Coverage for non-server tools --- tests/clickhouse-test | 23 +++++++++++++++++++++++ tests/queries/shell_config.sh | 4 ++++ 2 files changed, 27 insertions(+) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index eb85bdff0f5..bd796dbfdf2 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -12,6 +12,7 @@ import itertools import sys import os import os.path +import glob import platform import signal import re @@ -74,6 +75,10 @@ def stringhash(s): # only during process invocation https://stackoverflow.com/a/42089311 return zlib.crc32(s.encode("utf-8")) +def read_file_as_binary_string(file_path): + with open(file_path, 'rb') as file: + binary_data = file.read() + return binary_data # First and last lines of the log def trim_for_log(s): @@ -101,6 +106,7 @@ class HTTPError(Exception): def clickhouse_execute_http( base_args, query, + body=None, timeout=30, settings=None, default_format=None, @@ -140,6 +146,7 @@ def clickhouse_execute_http( client.request( "POST", f"/?{base_args.client_options_query_str}{urllib.parse.urlencode(params)}", + body=body ) res = client.getresponse() data = res.read() @@ -160,6 +167,7 @@ def clickhouse_execute_http( def clickhouse_execute( base_args, query, + body=None, timeout=30, settings=None, max_http_retries=5, @@ -168,6 +176,7 @@ def clickhouse_execute( return clickhouse_execute_http( base_args, query, + body, timeout, settings, max_http_retries=max_http_retries, @@ -181,6 +190,7 @@ def clickhouse_execute_json( data = clickhouse_execute_http( base_args, query, + None, timeout, settings, "JSONEachRow", @@ -1253,6 +1263,19 @@ class TestCase: retry_error_codes=True, ) + # Check for dumped coverage files + file_pattern = "coverage.*" + matching_files = glob.glob(file_pattern) + for file_path in matching_files: + body = read_file_as_binary_string(file_path) + clickhouse_execute( + args, + f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', groupArray(data) FROM input('data UInt64') FORMAT RowBinary", + body=body, + retry_error_codes=True, + ) + os.remove(file_path) + coverage = clickhouse_execute( args, "SELECT length(coverageCurrent())", diff --git a/tests/queries/shell_config.sh b/tests/queries/shell_config.sh index c687a63623f..614bfcece8f 100644 --- a/tests/queries/shell_config.sh +++ b/tests/queries/shell_config.sh @@ -4,6 +4,10 @@ # Don't check for ODR violation, since we may test shared build with ASAN export ASAN_OPTIONS=detect_odr_violation=0 +# If ClickHouse was built with coverage - dump the coverage information at exit +# (in other cases this environment variable has no effect) +export CLICKHOUSE_WRITE_COVERAGE="coverage" + export CLICKHOUSE_DATABASE=${CLICKHOUSE_DATABASE:="test"} export CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=${CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL:="warning"} From 678a32cedee768b6c1a6748e96a0d103e853d8bc Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 06:12:58 +0100 Subject: [PATCH 081/884] Obey Python's quirky formatter --- tests/integration/ci-runner.py | 13 +- 
.../test_async_insert_memory/test.py | 2 +- tests/integration/test_check_table/test.py | 76 +++-- .../test_cluster_discovery/test.py | 2 +- .../test_ldap_external_user_directory/test.py | 26 +- tests/integration/test_mysql_protocol/test.py | 16 +- tests/integration/test_partition/test.py | 4 +- .../test_replicated_database/test.py | 9 +- .../test.py | 9 +- .../s3_mocks/unstable_server.py | 2 +- tests/integration/test_storage_s3/test.py | 17 +- tests/integration/test_storage_url/test.py | 22 +- tests/integration/test_system_merges/test.py | 45 ++- utils/grpc-client/pb2/clickhouse_grpc_pb2.py | 271 ++++++++++-------- .../pb2/clickhouse_grpc_pb2_grpc.py | 237 +++++++++------ 15 files changed, 433 insertions(+), 318 deletions(-) diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index 7c922e339fe..d54ed2bb767 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -305,14 +305,11 @@ class ClickhouseIntegrationTestsRunner: def _pre_pull_images(self, repo_path): image_cmd = self._get_runner_image_cmd(repo_path) - cmd = ( - "cd {repo_path}/tests/integration && " - "timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} --pre-pull --command '{command}' ".format( - repo_path=repo_path, - runner_opts=self._get_runner_opts(), - image_cmd=image_cmd, - command=r""" echo Pre Pull finished """, - ) + cmd = "cd {repo_path}/tests/integration && " "timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} --pre-pull --command '{command}' ".format( + repo_path=repo_path, + runner_opts=self._get_runner_opts(), + image_cmd=image_cmd, + command=r""" echo Pre Pull finished """, ) for i in range(5): diff --git a/tests/integration/test_async_insert_memory/test.py b/tests/integration/test_async_insert_memory/test.py index 5d2e5503680..f897007f7bb 100644 --- a/tests/integration/test_async_insert_memory/test.py +++ b/tests/integration/test_async_insert_memory/test.py @@ -43,7 +43,7 @@ def test_memory_usage(): response = node.get_query_request( "SELECT groupArray(number) FROM numbers(1000000) SETTINGS max_memory_usage_for_user={}".format( - 30 * (2**23) + 30 * (2 ** 23) ), user="A", ) diff --git a/tests/integration/test_check_table/test.py b/tests/integration/test_check_table/test.py index 021977fb6b6..ebf404e698b 100644 --- a/tests/integration/test_check_table/test.py +++ b/tests/integration/test_check_table/test.py @@ -95,15 +95,25 @@ def test_check_normal_table_corruption(started_cluster, merge_tree_settings): node1, "non_replicated_mt", "201902_1_1_0", database="default" ) - assert node1.query( - "CHECK TABLE non_replicated_mt", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ).strip().split("\t")[0:2] == ["201902_1_1_0", "0"] + assert ( + node1.query( + "CHECK TABLE non_replicated_mt", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + .strip() + .split("\t")[0:2] + == ["201902_1_1_0", "0"] + ) - assert node1.query( - "CHECK TABLE non_replicated_mt", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ).strip().split("\t")[0:2] == ["201902_1_1_0", "0"] + assert ( + node1.query( + "CHECK TABLE non_replicated_mt", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + .strip() + .split("\t")[0:2] + == ["201902_1_1_0", "0"] + ) node1.query( "INSERT INTO non_replicated_mt VALUES (toDate('2019-01-01'), 1, 10), (toDate('2019-01-01'), 2, 12)" @@ -123,10 +133,15 @@ def test_check_normal_table_corruption(started_cluster, merge_tree_settings): remove_checksums_on_disk(node1, 
"default", "non_replicated_mt", "201901_2_2_0") - assert node1.query( - "CHECK TABLE non_replicated_mt PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ).strip().split("\t")[0:2] == ["201901_2_2_0", "0"] + assert ( + node1.query( + "CHECK TABLE non_replicated_mt PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + .strip() + .split("\t")[0:2] + == ["201901_2_2_0", "0"] + ) @pytest.mark.parametrize("merge_tree_settings, zk_path_suffix", [("", "_0")]) @@ -194,12 +209,15 @@ def test_check_replicated_table_simple( == "201901_0_0_0\t1\t\n" ) - assert sorted( - node2.query( - "CHECK TABLE replicated_mt", - settings={"check_query_single_value_result": 0}, - ).split("\n") - ) == ["", "201901_0_0_0\t1\t", "201902_0_0_0\t1\t"] + assert ( + sorted( + node2.query( + "CHECK TABLE replicated_mt", + settings={"check_query_single_value_result": 0}, + ).split("\n") + ) + == ["", "201901_0_0_0\t1\t", "201902_0_0_0\t1\t"] + ) with pytest.raises(QueryRuntimeException) as exc: node2.query( @@ -273,10 +291,13 @@ def test_check_replicated_table_corruption( ) node1.query_with_retry("SYSTEM SYNC REPLICA replicated_mt_1") - assert node1.query( - "CHECK TABLE replicated_mt_1 PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) == "{}\t1\t\n".format(part_name) + assert ( + node1.query( + "CHECK TABLE replicated_mt_1 PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + == "{}\t1\t\n".format(part_name) + ) assert node1.query("SELECT count() from replicated_mt_1") == "4\n" remove_part_from_disk(node2, "replicated_mt_1", part_name) @@ -288,10 +309,13 @@ def test_check_replicated_table_corruption( ) node1.query("SYSTEM SYNC REPLICA replicated_mt_1") - assert node1.query( - "CHECK TABLE replicated_mt_1 PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) == "{}\t1\t\n".format(part_name) + assert ( + node1.query( + "CHECK TABLE replicated_mt_1 PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) + == "{}\t1\t\n".format(part_name) + ) assert node1.query("SELECT count() from replicated_mt_1") == "4\n" diff --git a/tests/integration/test_cluster_discovery/test.py b/tests/integration/test_cluster_discovery/test.py index ad3deb5b142..a2e7e15b956 100644 --- a/tests/integration/test_cluster_discovery/test.py +++ b/tests/integration/test_cluster_discovery/test.py @@ -61,7 +61,7 @@ def check_on_cluster( print(f"Retry {retry}/{retries} unsuccessful, result: {node_results}") if retry != retries: - time.sleep(2**retry) + time.sleep(2 ** retry) else: msg = msg or f"Wrong '{what}' result" raise Exception( diff --git a/tests/integration/test_ldap_external_user_directory/test.py b/tests/integration/test_ldap_external_user_directory/test.py index 39753794d63..c9642c293ee 100644 --- a/tests/integration/test_ldap_external_user_directory/test.py +++ b/tests/integration/test_ldap_external_user_directory/test.py @@ -76,11 +76,14 @@ def test_role_mapping(ldap_cluster): "select currentUser()", user="johndoe", password="qwertz" ) == TSV([["johndoe"]]) - assert instance.query( - "select role_name from system.current_roles ORDER BY role_name", - user="johndoe", - password="qwertz", - ) == TSV([["role_1"], ["role_2"]]) + assert ( + instance.query( + "select role_name from system.current_roles ORDER BY role_name", + user="johndoe", + password="qwertz", + ) + == TSV([["role_1"], ["role_2"]]) + ) instance.query("CREATE 
ROLE role_3") add_ldap_group(ldap_cluster, group_cn="clickhouse-role_3", member_cn="johndoe") @@ -88,8 +91,11 @@ def test_role_mapping(ldap_cluster): # See https://github.com/ClickHouse/ClickHouse/issues/54318 add_ldap_group(ldap_cluster, group_cn="clickhouse-role_4", member_cn="johndoe") - assert instance.query( - "select role_name from system.current_roles ORDER BY role_name", - user="johndoe", - password="qwertz", - ) == TSV([["role_1"], ["role_2"], ["role_3"]]) + assert ( + instance.query( + "select role_name from system.current_roles ORDER BY role_name", + user="johndoe", + password="qwertz", + ) + == TSV([["role_1"], ["role_2"], ["role_3"]]) + ) diff --git a/tests/integration/test_mysql_protocol/test.py b/tests/integration/test_mysql_protocol/test.py index 7a69d07633c..61e76c0dc97 100644 --- a/tests/integration/test_mysql_protocol/test.py +++ b/tests/integration/test_mysql_protocol/test.py @@ -854,14 +854,14 @@ def test_types(started_cluster): result = cursor.fetchall()[0] expected = [ - ("Int8_column", -(2**7)), - ("UInt8_column", 2**8 - 1), - ("Int16_column", -(2**15)), - ("UInt16_column", 2**16 - 1), - ("Int32_column", -(2**31)), - ("UInt32_column", 2**32 - 1), - ("Int64_column", -(2**63)), - ("UInt64_column", 2**64 - 1), + ("Int8_column", -(2 ** 7)), + ("UInt8_column", 2 ** 8 - 1), + ("Int16_column", -(2 ** 15)), + ("UInt16_column", 2 ** 16 - 1), + ("Int32_column", -(2 ** 31)), + ("UInt32_column", 2 ** 32 - 1), + ("Int64_column", -(2 ** 63)), + ("UInt64_column", 2 ** 64 - 1), ("String_column", "тест"), ("FixedString_column", "тест"), ("Float32_column", 1.5), diff --git a/tests/integration/test_partition/test.py b/tests/integration/test_partition/test.py index 054418a8ba9..d39787f8924 100644 --- a/tests/integration/test_partition/test.py +++ b/tests/integration/test_partition/test.py @@ -561,9 +561,7 @@ def test_make_clone_in_detached(started_cluster): ["cp", "-r", path + "all_0_0_0", path + "detached/broken_all_0_0_0"] ) assert_eq_with_retry(instance, "select * from clone_in_detached", "\n") - assert [ - "broken_all_0_0_0", - ] == sorted( + assert ["broken_all_0_0_0",] == sorted( instance.exec_in_container(["ls", path + "detached/"]).strip().split("\n") ) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 1fc3fe37044..16425c9bd9e 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -506,12 +506,9 @@ def test_alters_from_different_replicas(started_cluster): dummy_node.stop_clickhouse(kill=True) settings = {"distributed_ddl_task_timeout": 5} - assert ( - "There are 1 unfinished hosts (0 of them are currently executing the task" - in competing_node.query_and_get_error( - "ALTER TABLE alters_from_different_replicas.concurrent_test ADD COLUMN Added0 UInt32;", - settings=settings, - ) + assert "There are 1 unfinished hosts (0 of them are currently executing the task" in competing_node.query_and_get_error( + "ALTER TABLE alters_from_different_replicas.concurrent_test ADD COLUMN Added0 UInt32;", + settings=settings, ) settings = { "distributed_ddl_task_timeout": 5, diff --git a/tests/integration/test_replicated_database_cluster_groups/test.py b/tests/integration/test_replicated_database_cluster_groups/test.py index 647626d8014..5a315707efb 100644 --- a/tests/integration/test_replicated_database_cluster_groups/test.py +++ b/tests/integration/test_replicated_database_cluster_groups/test.py @@ -95,12 +95,9 @@ def test_cluster_groups(started_cluster): 
# Exception main_node_2.stop_clickhouse() settings = {"distributed_ddl_task_timeout": 5} - assert ( - "There are 1 unfinished hosts (0 of them are currently executing the task)" - in main_node_1.query_and_get_error( - "CREATE TABLE cluster_groups.table_2 (d Date, k UInt64) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);", - settings=settings, - ) + assert "There are 1 unfinished hosts (0 of them are currently executing the task)" in main_node_1.query_and_get_error( + "CREATE TABLE cluster_groups.table_2 (d Date, k UInt64) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);", + settings=settings, ) # 3. After start both groups are synced diff --git a/tests/integration/test_storage_s3/s3_mocks/unstable_server.py b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py index 5ef781bdc9e..3632fa15d8a 100644 --- a/tests/integration/test_storage_s3/s3_mocks/unstable_server.py +++ b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py @@ -9,7 +9,7 @@ import time def gen_n_digit_number(n): assert 0 < n < 19 - return random.randint(10 ** (n - 1), 10**n - 1) + return random.randint(10 ** (n - 1), 10 ** n - 1) sum_in_4_column = 0 diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 2549cb0d473..e941356261a 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -553,16 +553,13 @@ def test_multipart(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) # select uploaded data from many threads - select_query = ( - "select sum(column1), sum(column2), sum(column3) " - "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( - host=started_cluster.minio_redirect_host, - port=started_cluster.minio_redirect_port, - bucket=bucket, - filename=filename, - auth=maybe_auth, - table_format=table_format, - ) + select_query = "select sum(column1), sum(column2), sum(column3) " "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( + host=started_cluster.minio_redirect_host, + port=started_cluster.minio_redirect_port, + bucket=bucket, + filename=filename, + auth=maybe_auth, + table_format=table_format, ) try: select_result = run_query( diff --git a/tests/integration/test_storage_url/test.py b/tests/integration/test_storage_url/test.py index 7ff7a871413..771df49cbac 100644 --- a/tests/integration/test_storage_url/test.py +++ b/tests/integration/test_storage_url/test.py @@ -79,15 +79,21 @@ def test_table_function_url_access_rights(): f"SELECT * FROM url('http://nginx:80/test_1', 'TSV')", user="u1" ) - assert node1.query( - f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", - user="u1", - ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) + assert ( + node1.query( + f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) + == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) + ) - assert node1.query( - f"DESCRIBE TABLE url('http://nginx:80/not-exist', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", - user="u1", - ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) + assert ( + node1.query( + f"DESCRIBE TABLE url('http://nginx:80/not-exist', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) + == 
TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) + ) expected_error = "necessary to have the grant URL ON *.*" assert expected_error in node1.query_and_get_error( diff --git a/tests/integration/test_system_merges/test.py b/tests/integration/test_system_merges/test.py index 6dbe6c891f2..bacb0eb500d 100644 --- a/tests/integration/test_system_merges/test.py +++ b/tests/integration/test_system_merges/test.py @@ -204,36 +204,33 @@ def test_mutation_simple(started_cluster, replicated): sleep_time=0.1, ) - assert ( - split_tsv( - node_check.query( - """ + assert split_tsv( + node_check.query( + """ SELECT database, table, num_parts, source_part_names, source_part_paths, result_part_name, result_part_path, partition_id, is_mutation FROM system.merges WHERE table = '{name}' """.format( - name=table_name - ) + name=table_name ) ) - == [ - [ - db_name, - table_name, - "1", - "['{}']".format(part), - "['{clickhouse}/{table_path}/{}/']".format( - part, clickhouse=clickhouse_path, table_path=table_path - ), - result_part, - "{clickhouse}/{table_path}/{}/".format( - result_part, clickhouse=clickhouse_path, table_path=table_path - ), - "all", - "1", - ], - ] - ) + ) == [ + [ + db_name, + table_name, + "1", + "['{}']".format(part), + "['{clickhouse}/{table_path}/{}/']".format( + part, clickhouse=clickhouse_path, table_path=table_path + ), + result_part, + "{clickhouse}/{table_path}/{}/".format( + result_part, clickhouse=clickhouse_path, table_path=table_path + ), + "all", + "1", + ], + ] t.join() assert ( diff --git a/utils/grpc-client/pb2/clickhouse_grpc_pb2.py b/utils/grpc-client/pb2/clickhouse_grpc_pb2.py index 6218047af3c..9bf7817c7d3 100644 --- a/utils/grpc-client/pb2/clickhouse_grpc_pb2.py +++ b/utils/grpc-client/pb2/clickhouse_grpc_pb2.py @@ -8,16 +8,17 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database + # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( + b'\n\x15\x63lickhouse_grpc.proto\x12\x0f\x63lickhouse.grpc")\n\x0bNameAndType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t"\xf5\x01\n\rExternalTable\x12\x0c\n\x04name\x18\x01 \x01(\t\x12-\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x12\x0e\n\x06\x66ormat\x18\x04 \x01(\t\x12\x18\n\x10\x63ompression_type\x18\x06 \x01(\t\x12>\n\x08settings\x18\x05 \x03(\x0b\x32,.clickhouse.grpc.ExternalTable.SettingsEntry\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\x85\x03\n\x1cObsoleteTransportCompression\x12U\n\talgorithm\x18\x01 \x01(\x0e\x32\x42.clickhouse.grpc.ObsoleteTransportCompression.CompressionAlgorithm\x12M\n\x05level\x18\x02 \x01(\x0e\x32>.clickhouse.grpc.ObsoleteTransportCompression.CompressionLevel"R\n\x14\x43ompressionAlgorithm\x12\x12\n\x0eNO_COMPRESSION\x10\x00\x12\x0b\n\x07\x44\x45\x46LATE\x10\x01\x12\x08\n\x04GZIP\x10\x02\x12\x0f\n\x0bSTREAM_GZIP\x10\x03"k\n\x10\x43ompressionLevel\x12\x14\n\x10\x43OMPRESSION_NONE\x10\x00\x12\x13\n\x0f\x43OMPRESSION_LOW\x10\x01\x12\x16\n\x12\x43OMPRESSION_MEDIUM\x10\x02\x12\x14\n\x10\x43OMPRESSION_HIGH\x10\x03"\x8e\x06\n\tQueryInfo\x12\r\n\x05query\x18\x01 \x01(\t\x12\x10\n\x08query_id\x18\x02 \x01(\t\x12:\n\x08settings\x18\x03 
\x03(\x0b\x32(.clickhouse.grpc.QueryInfo.SettingsEntry\x12\x10\n\x08\x64\x61tabase\x18\x04 \x01(\t\x12\x12\n\ninput_data\x18\x05 \x01(\x0c\x12\x1c\n\x14input_data_delimiter\x18\x06 \x01(\x0c\x12\x15\n\routput_format\x18\x07 \x01(\t\x12\x1b\n\x13send_output_columns\x18\x18 \x01(\x08\x12\x37\n\x0f\x65xternal_tables\x18\x08 \x03(\x0b\x32\x1e.clickhouse.grpc.ExternalTable\x12\x11\n\tuser_name\x18\t \x01(\t\x12\x10\n\x08password\x18\n \x01(\t\x12\r\n\x05quota\x18\x0b \x01(\t\x12\x12\n\nsession_id\x18\x0c \x01(\t\x12\x15\n\rsession_check\x18\r \x01(\x08\x12\x17\n\x0fsession_timeout\x18\x0e \x01(\r\x12\x0e\n\x06\x63\x61ncel\x18\x0f \x01(\x08\x12\x17\n\x0fnext_query_info\x18\x10 \x01(\x08\x12\x1e\n\x16input_compression_type\x18\x14 \x01(\t\x12\x1f\n\x17output_compression_type\x18\x15 \x01(\t\x12 \n\x18output_compression_level\x18\x13 \x01(\x05\x12"\n\x1atransport_compression_type\x18\x16 \x01(\t\x12#\n\x1btransport_compression_level\x18\x17 \x01(\x05\x12R\n\x1bobsolete_result_compression\x18\x11 \x01(\x0b\x32-.clickhouse.grpc.ObsoleteTransportCompression\x12!\n\x19obsolete_compression_type\x18\x12 \x01(\t\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\xa1\x01\n\x08LogEntry\x12\x0c\n\x04time\x18\x01 \x01(\r\x12\x19\n\x11time_microseconds\x18\x02 \x01(\r\x12\x11\n\tthread_id\x18\x03 \x01(\x04\x12\x10\n\x08query_id\x18\x04 \x01(\t\x12)\n\x05level\x18\x05 \x01(\x0e\x32\x1a.clickhouse.grpc.LogsLevel\x12\x0e\n\x06source\x18\x06 \x01(\t\x12\x0c\n\x04text\x18\x07 \x01(\t"z\n\x08Progress\x12\x11\n\tread_rows\x18\x01 \x01(\x04\x12\x12\n\nread_bytes\x18\x02 \x01(\x04\x12\x1a\n\x12total_rows_to_read\x18\x03 \x01(\x04\x12\x14\n\x0cwritten_rows\x18\x04 \x01(\x04\x12\x15\n\rwritten_bytes\x18\x05 \x01(\x04"p\n\x05Stats\x12\x0c\n\x04rows\x18\x01 \x01(\x04\x12\x0e\n\x06\x62locks\x18\x02 \x01(\x04\x12\x17\n\x0f\x61llocated_bytes\x18\x03 \x01(\x04\x12\x15\n\rapplied_limit\x18\x04 \x01(\x08\x12\x19\n\x11rows_before_limit\x18\x05 \x01(\x04"R\n\tException\x12\x0c\n\x04\x63ode\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x14\n\x0c\x64isplay_text\x18\x03 \x01(\t\x12\x13\n\x0bstack_trace\x18\x04 \x01(\t"\xeb\x02\n\x06Result\x12\x10\n\x08query_id\x18\t \x01(\t\x12\x11\n\ttime_zone\x18\n \x01(\t\x12\x15\n\routput_format\x18\x0b \x01(\t\x12\x34\n\x0eoutput_columns\x18\x0c \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0e\n\x06output\x18\x01 \x01(\x0c\x12\x0e\n\x06totals\x18\x02 \x01(\x0c\x12\x10\n\x08\x65xtremes\x18\x03 \x01(\x0c\x12\'\n\x04logs\x18\x04 \x03(\x0b\x32\x19.clickhouse.grpc.LogEntry\x12+\n\x08progress\x18\x05 \x01(\x0b\x32\x19.clickhouse.grpc.Progress\x12%\n\x05stats\x18\x06 \x01(\x0b\x32\x16.clickhouse.grpc.Stats\x12-\n\texception\x18\x07 \x01(\x0b\x32\x1a.clickhouse.grpc.Exception\x12\x11\n\tcancelled\x18\x08 
\x01(\x08*\x9d\x01\n\tLogsLevel\x12\x0c\n\x08LOG_NONE\x10\x00\x12\r\n\tLOG_FATAL\x10\x01\x12\x10\n\x0cLOG_CRITICAL\x10\x02\x12\r\n\tLOG_ERROR\x10\x03\x12\x0f\n\x0bLOG_WARNING\x10\x04\x12\x0e\n\nLOG_NOTICE\x10\x05\x12\x13\n\x0fLOG_INFORMATION\x10\x06\x12\r\n\tLOG_DEBUG\x10\x07\x12\r\n\tLOG_TRACE\x10\x08\x32\xdb\x02\n\nClickHouse\x12\x45\n\x0c\x45xecuteQuery\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00\x12V\n\x1b\x45xecuteQueryWithStreamInput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00(\x01\x12W\n\x1c\x45xecuteQueryWithStreamOutput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00\x30\x01\x12U\n\x18\x45xecuteQueryWithStreamIO\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00(\x01\x30\x01\x62\x06proto3' +) - -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63lickhouse_grpc.proto\x12\x0f\x63lickhouse.grpc\")\n\x0bNameAndType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\"\xf5\x01\n\rExternalTable\x12\x0c\n\x04name\x18\x01 \x01(\t\x12-\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x12\x0e\n\x06\x66ormat\x18\x04 \x01(\t\x12\x18\n\x10\x63ompression_type\x18\x06 \x01(\t\x12>\n\x08settings\x18\x05 \x03(\x0b\x32,.clickhouse.grpc.ExternalTable.SettingsEntry\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x85\x03\n\x1cObsoleteTransportCompression\x12U\n\talgorithm\x18\x01 \x01(\x0e\x32\x42.clickhouse.grpc.ObsoleteTransportCompression.CompressionAlgorithm\x12M\n\x05level\x18\x02 \x01(\x0e\x32>.clickhouse.grpc.ObsoleteTransportCompression.CompressionLevel\"R\n\x14\x43ompressionAlgorithm\x12\x12\n\x0eNO_COMPRESSION\x10\x00\x12\x0b\n\x07\x44\x45\x46LATE\x10\x01\x12\x08\n\x04GZIP\x10\x02\x12\x0f\n\x0bSTREAM_GZIP\x10\x03\"k\n\x10\x43ompressionLevel\x12\x14\n\x10\x43OMPRESSION_NONE\x10\x00\x12\x13\n\x0f\x43OMPRESSION_LOW\x10\x01\x12\x16\n\x12\x43OMPRESSION_MEDIUM\x10\x02\x12\x14\n\x10\x43OMPRESSION_HIGH\x10\x03\"\x8e\x06\n\tQueryInfo\x12\r\n\x05query\x18\x01 \x01(\t\x12\x10\n\x08query_id\x18\x02 \x01(\t\x12:\n\x08settings\x18\x03 \x03(\x0b\x32(.clickhouse.grpc.QueryInfo.SettingsEntry\x12\x10\n\x08\x64\x61tabase\x18\x04 \x01(\t\x12\x12\n\ninput_data\x18\x05 \x01(\x0c\x12\x1c\n\x14input_data_delimiter\x18\x06 \x01(\x0c\x12\x15\n\routput_format\x18\x07 \x01(\t\x12\x1b\n\x13send_output_columns\x18\x18 \x01(\x08\x12\x37\n\x0f\x65xternal_tables\x18\x08 \x03(\x0b\x32\x1e.clickhouse.grpc.ExternalTable\x12\x11\n\tuser_name\x18\t \x01(\t\x12\x10\n\x08password\x18\n \x01(\t\x12\r\n\x05quota\x18\x0b \x01(\t\x12\x12\n\nsession_id\x18\x0c \x01(\t\x12\x15\n\rsession_check\x18\r \x01(\x08\x12\x17\n\x0fsession_timeout\x18\x0e \x01(\r\x12\x0e\n\x06\x63\x61ncel\x18\x0f \x01(\x08\x12\x17\n\x0fnext_query_info\x18\x10 \x01(\x08\x12\x1e\n\x16input_compression_type\x18\x14 \x01(\t\x12\x1f\n\x17output_compression_type\x18\x15 \x01(\t\x12 \n\x18output_compression_level\x18\x13 \x01(\x05\x12\"\n\x1atransport_compression_type\x18\x16 \x01(\t\x12#\n\x1btransport_compression_level\x18\x17 \x01(\x05\x12R\n\x1bobsolete_result_compression\x18\x11 \x01(\x0b\x32-.clickhouse.grpc.ObsoleteTransportCompression\x12!\n\x19obsolete_compression_type\x18\x12 \x01(\t\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xa1\x01\n\x08LogEntry\x12\x0c\n\x04time\x18\x01 \x01(\r\x12\x19\n\x11time_microseconds\x18\x02 \x01(\r\x12\x11\n\tthread_id\x18\x03 
\x01(\x04\x12\x10\n\x08query_id\x18\x04 \x01(\t\x12)\n\x05level\x18\x05 \x01(\x0e\x32\x1a.clickhouse.grpc.LogsLevel\x12\x0e\n\x06source\x18\x06 \x01(\t\x12\x0c\n\x04text\x18\x07 \x01(\t\"z\n\x08Progress\x12\x11\n\tread_rows\x18\x01 \x01(\x04\x12\x12\n\nread_bytes\x18\x02 \x01(\x04\x12\x1a\n\x12total_rows_to_read\x18\x03 \x01(\x04\x12\x14\n\x0cwritten_rows\x18\x04 \x01(\x04\x12\x15\n\rwritten_bytes\x18\x05 \x01(\x04\"p\n\x05Stats\x12\x0c\n\x04rows\x18\x01 \x01(\x04\x12\x0e\n\x06\x62locks\x18\x02 \x01(\x04\x12\x17\n\x0f\x61llocated_bytes\x18\x03 \x01(\x04\x12\x15\n\rapplied_limit\x18\x04 \x01(\x08\x12\x19\n\x11rows_before_limit\x18\x05 \x01(\x04\"R\n\tException\x12\x0c\n\x04\x63ode\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x14\n\x0c\x64isplay_text\x18\x03 \x01(\t\x12\x13\n\x0bstack_trace\x18\x04 \x01(\t\"\xeb\x02\n\x06Result\x12\x10\n\x08query_id\x18\t \x01(\t\x12\x11\n\ttime_zone\x18\n \x01(\t\x12\x15\n\routput_format\x18\x0b \x01(\t\x12\x34\n\x0eoutput_columns\x18\x0c \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0e\n\x06output\x18\x01 \x01(\x0c\x12\x0e\n\x06totals\x18\x02 \x01(\x0c\x12\x10\n\x08\x65xtremes\x18\x03 \x01(\x0c\x12\'\n\x04logs\x18\x04 \x03(\x0b\x32\x19.clickhouse.grpc.LogEntry\x12+\n\x08progress\x18\x05 \x01(\x0b\x32\x19.clickhouse.grpc.Progress\x12%\n\x05stats\x18\x06 \x01(\x0b\x32\x16.clickhouse.grpc.Stats\x12-\n\texception\x18\x07 \x01(\x0b\x32\x1a.clickhouse.grpc.Exception\x12\x11\n\tcancelled\x18\x08 \x01(\x08*\x9d\x01\n\tLogsLevel\x12\x0c\n\x08LOG_NONE\x10\x00\x12\r\n\tLOG_FATAL\x10\x01\x12\x10\n\x0cLOG_CRITICAL\x10\x02\x12\r\n\tLOG_ERROR\x10\x03\x12\x0f\n\x0bLOG_WARNING\x10\x04\x12\x0e\n\nLOG_NOTICE\x10\x05\x12\x13\n\x0fLOG_INFORMATION\x10\x06\x12\r\n\tLOG_DEBUG\x10\x07\x12\r\n\tLOG_TRACE\x10\x08\x32\xdb\x02\n\nClickHouse\x12\x45\n\x0c\x45xecuteQuery\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00\x12V\n\x1b\x45xecuteQueryWithStreamInput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00(\x01\x12W\n\x1c\x45xecuteQueryWithStreamOutput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00\x30\x01\x12U\n\x18\x45xecuteQueryWithStreamIO\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00(\x01\x30\x01\x62\x06proto3') - -_LOGSLEVEL = DESCRIPTOR.enum_types_by_name['LogsLevel'] +_LOGSLEVEL = DESCRIPTOR.enum_types_by_name["LogsLevel"] LogsLevel = enum_type_wrapper.EnumTypeWrapper(_LOGSLEVEL) LOG_NONE = 0 LOG_FATAL = 1 @@ -30,134 +31,180 @@ LOG_DEBUG = 7 LOG_TRACE = 8 -_NAMEANDTYPE = DESCRIPTOR.message_types_by_name['NameAndType'] -_EXTERNALTABLE = DESCRIPTOR.message_types_by_name['ExternalTable'] -_EXTERNALTABLE_SETTINGSENTRY = _EXTERNALTABLE.nested_types_by_name['SettingsEntry'] -_OBSOLETETRANSPORTCOMPRESSION = DESCRIPTOR.message_types_by_name['ObsoleteTransportCompression'] -_QUERYINFO = DESCRIPTOR.message_types_by_name['QueryInfo'] -_QUERYINFO_SETTINGSENTRY = _QUERYINFO.nested_types_by_name['SettingsEntry'] -_LOGENTRY = DESCRIPTOR.message_types_by_name['LogEntry'] -_PROGRESS = DESCRIPTOR.message_types_by_name['Progress'] -_STATS = DESCRIPTOR.message_types_by_name['Stats'] -_EXCEPTION = DESCRIPTOR.message_types_by_name['Exception'] -_RESULT = DESCRIPTOR.message_types_by_name['Result'] -_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM = _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name['CompressionAlgorithm'] -_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL = _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name['CompressionLevel'] -NameAndType = 
_reflection.GeneratedProtocolMessageType('NameAndType', (_message.Message,), { - 'DESCRIPTOR' : _NAMEANDTYPE, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.NameAndType) - }) +_NAMEANDTYPE = DESCRIPTOR.message_types_by_name["NameAndType"] +_EXTERNALTABLE = DESCRIPTOR.message_types_by_name["ExternalTable"] +_EXTERNALTABLE_SETTINGSENTRY = _EXTERNALTABLE.nested_types_by_name["SettingsEntry"] +_OBSOLETETRANSPORTCOMPRESSION = DESCRIPTOR.message_types_by_name[ + "ObsoleteTransportCompression" +] +_QUERYINFO = DESCRIPTOR.message_types_by_name["QueryInfo"] +_QUERYINFO_SETTINGSENTRY = _QUERYINFO.nested_types_by_name["SettingsEntry"] +_LOGENTRY = DESCRIPTOR.message_types_by_name["LogEntry"] +_PROGRESS = DESCRIPTOR.message_types_by_name["Progress"] +_STATS = DESCRIPTOR.message_types_by_name["Stats"] +_EXCEPTION = DESCRIPTOR.message_types_by_name["Exception"] +_RESULT = DESCRIPTOR.message_types_by_name["Result"] +_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM = ( + _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name["CompressionAlgorithm"] +) +_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL = ( + _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name["CompressionLevel"] +) +NameAndType = _reflection.GeneratedProtocolMessageType( + "NameAndType", + (_message.Message,), + { + "DESCRIPTOR": _NAMEANDTYPE, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.NameAndType) + }, +) _sym_db.RegisterMessage(NameAndType) -ExternalTable = _reflection.GeneratedProtocolMessageType('ExternalTable', (_message.Message,), { - - 'SettingsEntry' : _reflection.GeneratedProtocolMessageType('SettingsEntry', (_message.Message,), { - 'DESCRIPTOR' : _EXTERNALTABLE_SETTINGSENTRY, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable.SettingsEntry) - }) - , - 'DESCRIPTOR' : _EXTERNALTABLE, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable) - }) +ExternalTable = _reflection.GeneratedProtocolMessageType( + "ExternalTable", + (_message.Message,), + { + "SettingsEntry": _reflection.GeneratedProtocolMessageType( + "SettingsEntry", + (_message.Message,), + { + "DESCRIPTOR": _EXTERNALTABLE_SETTINGSENTRY, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable.SettingsEntry) + }, + ), + "DESCRIPTOR": _EXTERNALTABLE, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable) + }, +) _sym_db.RegisterMessage(ExternalTable) _sym_db.RegisterMessage(ExternalTable.SettingsEntry) -ObsoleteTransportCompression = _reflection.GeneratedProtocolMessageType('ObsoleteTransportCompression', (_message.Message,), { - 'DESCRIPTOR' : _OBSOLETETRANSPORTCOMPRESSION, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ObsoleteTransportCompression) - }) +ObsoleteTransportCompression = _reflection.GeneratedProtocolMessageType( + "ObsoleteTransportCompression", + (_message.Message,), + { + "DESCRIPTOR": _OBSOLETETRANSPORTCOMPRESSION, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ObsoleteTransportCompression) + }, +) _sym_db.RegisterMessage(ObsoleteTransportCompression) -QueryInfo = _reflection.GeneratedProtocolMessageType('QueryInfo', (_message.Message,), { - - 'SettingsEntry' : _reflection.GeneratedProtocolMessageType('SettingsEntry', (_message.Message,), { - 
'DESCRIPTOR' : _QUERYINFO_SETTINGSENTRY, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo.SettingsEntry) - }) - , - 'DESCRIPTOR' : _QUERYINFO, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo) - }) +QueryInfo = _reflection.GeneratedProtocolMessageType( + "QueryInfo", + (_message.Message,), + { + "SettingsEntry": _reflection.GeneratedProtocolMessageType( + "SettingsEntry", + (_message.Message,), + { + "DESCRIPTOR": _QUERYINFO_SETTINGSENTRY, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo.SettingsEntry) + }, + ), + "DESCRIPTOR": _QUERYINFO, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo) + }, +) _sym_db.RegisterMessage(QueryInfo) _sym_db.RegisterMessage(QueryInfo.SettingsEntry) -LogEntry = _reflection.GeneratedProtocolMessageType('LogEntry', (_message.Message,), { - 'DESCRIPTOR' : _LOGENTRY, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.LogEntry) - }) +LogEntry = _reflection.GeneratedProtocolMessageType( + "LogEntry", + (_message.Message,), + { + "DESCRIPTOR": _LOGENTRY, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.LogEntry) + }, +) _sym_db.RegisterMessage(LogEntry) -Progress = _reflection.GeneratedProtocolMessageType('Progress', (_message.Message,), { - 'DESCRIPTOR' : _PROGRESS, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Progress) - }) +Progress = _reflection.GeneratedProtocolMessageType( + "Progress", + (_message.Message,), + { + "DESCRIPTOR": _PROGRESS, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Progress) + }, +) _sym_db.RegisterMessage(Progress) -Stats = _reflection.GeneratedProtocolMessageType('Stats', (_message.Message,), { - 'DESCRIPTOR' : _STATS, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Stats) - }) +Stats = _reflection.GeneratedProtocolMessageType( + "Stats", + (_message.Message,), + { + "DESCRIPTOR": _STATS, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Stats) + }, +) _sym_db.RegisterMessage(Stats) -Exception = _reflection.GeneratedProtocolMessageType('Exception', (_message.Message,), { - 'DESCRIPTOR' : _EXCEPTION, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Exception) - }) +Exception = _reflection.GeneratedProtocolMessageType( + "Exception", + (_message.Message,), + { + "DESCRIPTOR": _EXCEPTION, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Exception) + }, +) _sym_db.RegisterMessage(Exception) -Result = _reflection.GeneratedProtocolMessageType('Result', (_message.Message,), { - 'DESCRIPTOR' : _RESULT, - '__module__' : 'clickhouse_grpc_pb2' - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Result) - }) +Result = _reflection.GeneratedProtocolMessageType( + "Result", + (_message.Message,), + { + "DESCRIPTOR": _RESULT, + "__module__": "clickhouse_grpc_pb2" + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Result) + }, +) _sym_db.RegisterMessage(Result) -_CLICKHOUSE = DESCRIPTOR.services_by_name['ClickHouse'] +_CLICKHOUSE = DESCRIPTOR.services_by_name["ClickHouse"] if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - 
_EXTERNALTABLE_SETTINGSENTRY._options = None - _EXTERNALTABLE_SETTINGSENTRY._serialized_options = b'8\001' - _QUERYINFO_SETTINGSENTRY._options = None - _QUERYINFO_SETTINGSENTRY._serialized_options = b'8\001' - _LOGSLEVEL._serialized_start=2363 - _LOGSLEVEL._serialized_end=2520 - _NAMEANDTYPE._serialized_start=42 - _NAMEANDTYPE._serialized_end=83 - _EXTERNALTABLE._serialized_start=86 - _EXTERNALTABLE._serialized_end=331 - _EXTERNALTABLE_SETTINGSENTRY._serialized_start=284 - _EXTERNALTABLE_SETTINGSENTRY._serialized_end=331 - _OBSOLETETRANSPORTCOMPRESSION._serialized_start=334 - _OBSOLETETRANSPORTCOMPRESSION._serialized_end=723 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_start=532 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_end=614 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_start=616 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_end=723 - _QUERYINFO._serialized_start=726 - _QUERYINFO._serialized_end=1508 - _QUERYINFO_SETTINGSENTRY._serialized_start=284 - _QUERYINFO_SETTINGSENTRY._serialized_end=331 - _LOGENTRY._serialized_start=1511 - _LOGENTRY._serialized_end=1672 - _PROGRESS._serialized_start=1674 - _PROGRESS._serialized_end=1796 - _STATS._serialized_start=1798 - _STATS._serialized_end=1910 - _EXCEPTION._serialized_start=1912 - _EXCEPTION._serialized_end=1994 - _RESULT._serialized_start=1997 - _RESULT._serialized_end=2360 - _CLICKHOUSE._serialized_start=2523 - _CLICKHOUSE._serialized_end=2870 + DESCRIPTOR._options = None + _EXTERNALTABLE_SETTINGSENTRY._options = None + _EXTERNALTABLE_SETTINGSENTRY._serialized_options = b"8\001" + _QUERYINFO_SETTINGSENTRY._options = None + _QUERYINFO_SETTINGSENTRY._serialized_options = b"8\001" + _LOGSLEVEL._serialized_start = 2363 + _LOGSLEVEL._serialized_end = 2520 + _NAMEANDTYPE._serialized_start = 42 + _NAMEANDTYPE._serialized_end = 83 + _EXTERNALTABLE._serialized_start = 86 + _EXTERNALTABLE._serialized_end = 331 + _EXTERNALTABLE_SETTINGSENTRY._serialized_start = 284 + _EXTERNALTABLE_SETTINGSENTRY._serialized_end = 331 + _OBSOLETETRANSPORTCOMPRESSION._serialized_start = 334 + _OBSOLETETRANSPORTCOMPRESSION._serialized_end = 723 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_start = 532 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_end = 614 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_start = 616 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_end = 723 + _QUERYINFO._serialized_start = 726 + _QUERYINFO._serialized_end = 1508 + _QUERYINFO_SETTINGSENTRY._serialized_start = 284 + _QUERYINFO_SETTINGSENTRY._serialized_end = 331 + _LOGENTRY._serialized_start = 1511 + _LOGENTRY._serialized_end = 1672 + _PROGRESS._serialized_start = 1674 + _PROGRESS._serialized_end = 1796 + _STATS._serialized_start = 1798 + _STATS._serialized_end = 1910 + _EXCEPTION._serialized_start = 1912 + _EXCEPTION._serialized_end = 1994 + _RESULT._serialized_start = 1997 + _RESULT._serialized_end = 2360 + _CLICKHOUSE._serialized_start = 2523 + _CLICKHOUSE._serialized_end = 2870 # @@protoc_insertion_point(module_scope) diff --git a/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py b/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py index 1c71218bbe5..25643a243b3 100644 --- a/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py +++ b/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py @@ -15,25 +15,25 @@ class ClickHouseStub(object): channel: A grpc.Channel. 
""" self.ExecuteQuery = channel.unary_unary( - '/clickhouse.grpc.ClickHouse/ExecuteQuery', - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + "/clickhouse.grpc.ClickHouse/ExecuteQuery", + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamInput = channel.stream_unary( - '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput', - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput", + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamOutput = channel.unary_stream( - '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput', - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput", + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamIO = channel.stream_stream( - '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO', - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO", + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) class ClickHouseServicer(object): @@ -42,124 +42,173 @@ class ClickHouseServicer(object): def ExecuteQuery(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def ExecuteQueryWithStreamInput(self, request_iterator, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def ExecuteQueryWithStreamOutput(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def ExecuteQueryWithStreamIO(self, request_iterator, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details('Method not implemented!') - raise NotImplementedError('Method not implemented!') + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") def add_ClickHouseServicer_to_server(servicer, server): rpc_method_handlers = { - 'ExecuteQuery': grpc.unary_unary_rpc_method_handler( - servicer.ExecuteQuery, - 
request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - 'ExecuteQueryWithStreamInput': grpc.stream_unary_rpc_method_handler( - servicer.ExecuteQueryWithStreamInput, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - 'ExecuteQueryWithStreamOutput': grpc.unary_stream_rpc_method_handler( - servicer.ExecuteQueryWithStreamOutput, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - 'ExecuteQueryWithStreamIO': grpc.stream_stream_rpc_method_handler( - servicer.ExecuteQueryWithStreamIO, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), + "ExecuteQuery": grpc.unary_unary_rpc_method_handler( + servicer.ExecuteQuery, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + "ExecuteQueryWithStreamInput": grpc.stream_unary_rpc_method_handler( + servicer.ExecuteQueryWithStreamInput, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + "ExecuteQueryWithStreamOutput": grpc.unary_stream_rpc_method_handler( + servicer.ExecuteQueryWithStreamOutput, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + "ExecuteQueryWithStreamIO": grpc.stream_stream_rpc_method_handler( + servicer.ExecuteQueryWithStreamIO, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - 'clickhouse.grpc.ClickHouse', rpc_method_handlers) + "clickhouse.grpc.ClickHouse", rpc_method_handlers + ) server.add_generic_rpc_handlers((generic_handler,)) - # This class is part of an EXPERIMENTAL API. +# This class is part of an EXPERIMENTAL API. 
class ClickHouse(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def ExecuteQuery(request, + def ExecuteQuery( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_unary(request, target, '/clickhouse.grpc.ClickHouse/ExecuteQuery', + "/clickhouse.grpc.ClickHouse/ExecuteQuery", clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def ExecuteQueryWithStreamInput(request_iterator, + def ExecuteQueryWithStreamInput( + request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.stream_unary( + request_iterator, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.stream_unary(request_iterator, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput', + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput", clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def ExecuteQueryWithStreamOutput(request, + def ExecuteQueryWithStreamOutput( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_stream( + request, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.unary_stream(request, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput', + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput", clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) @staticmethod - def ExecuteQueryWithStreamIO(request_iterator, + def ExecuteQueryWithStreamIO( + request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.stream_stream( + request_iterator, target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - 
wait_for_ready=None, - timeout=None, - metadata=None): - return grpc.experimental.stream_stream(request_iterator, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO', + "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO", clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, channel_credentials, - insecure, call_credentials, compression, wait_for_ready, timeout, metadata) + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) From e42d10fa9ccf4296732941e9f1b333d692e83384 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 06:25:20 +0100 Subject: [PATCH 082/884] Revert "Obey Python's quirky formatter" This reverts commit 678a32cedee768b6c1a6748e96a0d103e853d8bc. --- tests/integration/ci-runner.py | 13 +- .../test_async_insert_memory/test.py | 2 +- tests/integration/test_check_table/test.py | 76 ++--- .../test_cluster_discovery/test.py | 2 +- .../test_ldap_external_user_directory/test.py | 26 +- tests/integration/test_mysql_protocol/test.py | 16 +- tests/integration/test_partition/test.py | 4 +- .../test_replicated_database/test.py | 9 +- .../test.py | 9 +- .../s3_mocks/unstable_server.py | 2 +- tests/integration/test_storage_s3/test.py | 17 +- tests/integration/test_storage_url/test.py | 22 +- tests/integration/test_system_merges/test.py | 45 +-- utils/grpc-client/pb2/clickhouse_grpc_pb2.py | 271 ++++++++---------- .../pb2/clickhouse_grpc_pb2_grpc.py | 237 ++++++--------- 15 files changed, 318 insertions(+), 433 deletions(-) diff --git a/tests/integration/ci-runner.py b/tests/integration/ci-runner.py index d54ed2bb767..7c922e339fe 100755 --- a/tests/integration/ci-runner.py +++ b/tests/integration/ci-runner.py @@ -305,11 +305,14 @@ class ClickhouseIntegrationTestsRunner: def _pre_pull_images(self, repo_path): image_cmd = self._get_runner_image_cmd(repo_path) - cmd = "cd {repo_path}/tests/integration && " "timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} --pre-pull --command '{command}' ".format( - repo_path=repo_path, - runner_opts=self._get_runner_opts(), - image_cmd=image_cmd, - command=r""" echo Pre Pull finished """, + cmd = ( + "cd {repo_path}/tests/integration && " + "timeout --signal=KILL 1h ./runner {runner_opts} {image_cmd} --pre-pull --command '{command}' ".format( + repo_path=repo_path, + runner_opts=self._get_runner_opts(), + image_cmd=image_cmd, + command=r""" echo Pre Pull finished """, + ) ) for i in range(5): diff --git a/tests/integration/test_async_insert_memory/test.py b/tests/integration/test_async_insert_memory/test.py index f897007f7bb..5d2e5503680 100644 --- a/tests/integration/test_async_insert_memory/test.py +++ b/tests/integration/test_async_insert_memory/test.py @@ -43,7 +43,7 @@ def test_memory_usage(): response = node.get_query_request( "SELECT groupArray(number) FROM numbers(1000000) SETTINGS max_memory_usage_for_user={}".format( - 30 * (2 ** 23) + 30 * (2**23) ), user="A", ) diff --git a/tests/integration/test_check_table/test.py b/tests/integration/test_check_table/test.py index ebf404e698b..021977fb6b6 100644 --- a/tests/integration/test_check_table/test.py +++ b/tests/integration/test_check_table/test.py @@ -95,25 +95,15 @@ def test_check_normal_table_corruption(started_cluster, merge_tree_settings): node1, "non_replicated_mt", "201902_1_1_0", database="default" ) - assert ( - node1.query( - "CHECK TABLE non_replicated_mt", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) 
- .strip() - .split("\t")[0:2] - == ["201902_1_1_0", "0"] - ) + assert node1.query( + "CHECK TABLE non_replicated_mt", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ).strip().split("\t")[0:2] == ["201902_1_1_0", "0"] - assert ( - node1.query( - "CHECK TABLE non_replicated_mt", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) - .strip() - .split("\t")[0:2] - == ["201902_1_1_0", "0"] - ) + assert node1.query( + "CHECK TABLE non_replicated_mt", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ).strip().split("\t")[0:2] == ["201902_1_1_0", "0"] node1.query( "INSERT INTO non_replicated_mt VALUES (toDate('2019-01-01'), 1, 10), (toDate('2019-01-01'), 2, 12)" @@ -133,15 +123,10 @@ def test_check_normal_table_corruption(started_cluster, merge_tree_settings): remove_checksums_on_disk(node1, "default", "non_replicated_mt", "201901_2_2_0") - assert ( - node1.query( - "CHECK TABLE non_replicated_mt PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) - .strip() - .split("\t")[0:2] - == ["201901_2_2_0", "0"] - ) + assert node1.query( + "CHECK TABLE non_replicated_mt PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ).strip().split("\t")[0:2] == ["201901_2_2_0", "0"] @pytest.mark.parametrize("merge_tree_settings, zk_path_suffix", [("", "_0")]) @@ -209,15 +194,12 @@ def test_check_replicated_table_simple( == "201901_0_0_0\t1\t\n" ) - assert ( - sorted( - node2.query( - "CHECK TABLE replicated_mt", - settings={"check_query_single_value_result": 0}, - ).split("\n") - ) - == ["", "201901_0_0_0\t1\t", "201902_0_0_0\t1\t"] - ) + assert sorted( + node2.query( + "CHECK TABLE replicated_mt", + settings={"check_query_single_value_result": 0}, + ).split("\n") + ) == ["", "201901_0_0_0\t1\t", "201902_0_0_0\t1\t"] with pytest.raises(QueryRuntimeException) as exc: node2.query( @@ -291,13 +273,10 @@ def test_check_replicated_table_corruption( ) node1.query_with_retry("SYSTEM SYNC REPLICA replicated_mt_1") - assert ( - node1.query( - "CHECK TABLE replicated_mt_1 PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) - == "{}\t1\t\n".format(part_name) - ) + assert node1.query( + "CHECK TABLE replicated_mt_1 PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) == "{}\t1\t\n".format(part_name) assert node1.query("SELECT count() from replicated_mt_1") == "4\n" remove_part_from_disk(node2, "replicated_mt_1", part_name) @@ -309,13 +288,10 @@ def test_check_replicated_table_corruption( ) node1.query("SYSTEM SYNC REPLICA replicated_mt_1") - assert ( - node1.query( - "CHECK TABLE replicated_mt_1 PARTITION 201901", - settings={"check_query_single_value_result": 0, "max_threads": 1}, - ) - == "{}\t1\t\n".format(part_name) - ) + assert node1.query( + "CHECK TABLE replicated_mt_1 PARTITION 201901", + settings={"check_query_single_value_result": 0, "max_threads": 1}, + ) == "{}\t1\t\n".format(part_name) assert node1.query("SELECT count() from replicated_mt_1") == "4\n" diff --git a/tests/integration/test_cluster_discovery/test.py b/tests/integration/test_cluster_discovery/test.py index a2e7e15b956..ad3deb5b142 100644 --- a/tests/integration/test_cluster_discovery/test.py +++ b/tests/integration/test_cluster_discovery/test.py @@ -61,7 +61,7 @@ def check_on_cluster( print(f"Retry {retry}/{retries} unsuccessful, result: {node_results}") if retry != retries: - time.sleep(2 ** retry) + time.sleep(2**retry) else: msg 
= msg or f"Wrong '{what}' result" raise Exception( diff --git a/tests/integration/test_ldap_external_user_directory/test.py b/tests/integration/test_ldap_external_user_directory/test.py index c9642c293ee..39753794d63 100644 --- a/tests/integration/test_ldap_external_user_directory/test.py +++ b/tests/integration/test_ldap_external_user_directory/test.py @@ -76,14 +76,11 @@ def test_role_mapping(ldap_cluster): "select currentUser()", user="johndoe", password="qwertz" ) == TSV([["johndoe"]]) - assert ( - instance.query( - "select role_name from system.current_roles ORDER BY role_name", - user="johndoe", - password="qwertz", - ) - == TSV([["role_1"], ["role_2"]]) - ) + assert instance.query( + "select role_name from system.current_roles ORDER BY role_name", + user="johndoe", + password="qwertz", + ) == TSV([["role_1"], ["role_2"]]) instance.query("CREATE ROLE role_3") add_ldap_group(ldap_cluster, group_cn="clickhouse-role_3", member_cn="johndoe") @@ -91,11 +88,8 @@ def test_role_mapping(ldap_cluster): # See https://github.com/ClickHouse/ClickHouse/issues/54318 add_ldap_group(ldap_cluster, group_cn="clickhouse-role_4", member_cn="johndoe") - assert ( - instance.query( - "select role_name from system.current_roles ORDER BY role_name", - user="johndoe", - password="qwertz", - ) - == TSV([["role_1"], ["role_2"], ["role_3"]]) - ) + assert instance.query( + "select role_name from system.current_roles ORDER BY role_name", + user="johndoe", + password="qwertz", + ) == TSV([["role_1"], ["role_2"], ["role_3"]]) diff --git a/tests/integration/test_mysql_protocol/test.py b/tests/integration/test_mysql_protocol/test.py index 61e76c0dc97..7a69d07633c 100644 --- a/tests/integration/test_mysql_protocol/test.py +++ b/tests/integration/test_mysql_protocol/test.py @@ -854,14 +854,14 @@ def test_types(started_cluster): result = cursor.fetchall()[0] expected = [ - ("Int8_column", -(2 ** 7)), - ("UInt8_column", 2 ** 8 - 1), - ("Int16_column", -(2 ** 15)), - ("UInt16_column", 2 ** 16 - 1), - ("Int32_column", -(2 ** 31)), - ("UInt32_column", 2 ** 32 - 1), - ("Int64_column", -(2 ** 63)), - ("UInt64_column", 2 ** 64 - 1), + ("Int8_column", -(2**7)), + ("UInt8_column", 2**8 - 1), + ("Int16_column", -(2**15)), + ("UInt16_column", 2**16 - 1), + ("Int32_column", -(2**31)), + ("UInt32_column", 2**32 - 1), + ("Int64_column", -(2**63)), + ("UInt64_column", 2**64 - 1), ("String_column", "тест"), ("FixedString_column", "тест"), ("Float32_column", 1.5), diff --git a/tests/integration/test_partition/test.py b/tests/integration/test_partition/test.py index d39787f8924..054418a8ba9 100644 --- a/tests/integration/test_partition/test.py +++ b/tests/integration/test_partition/test.py @@ -561,7 +561,9 @@ def test_make_clone_in_detached(started_cluster): ["cp", "-r", path + "all_0_0_0", path + "detached/broken_all_0_0_0"] ) assert_eq_with_retry(instance, "select * from clone_in_detached", "\n") - assert ["broken_all_0_0_0",] == sorted( + assert [ + "broken_all_0_0_0", + ] == sorted( instance.exec_in_container(["ls", path + "detached/"]).strip().split("\n") ) diff --git a/tests/integration/test_replicated_database/test.py b/tests/integration/test_replicated_database/test.py index 16425c9bd9e..1fc3fe37044 100644 --- a/tests/integration/test_replicated_database/test.py +++ b/tests/integration/test_replicated_database/test.py @@ -506,9 +506,12 @@ def test_alters_from_different_replicas(started_cluster): dummy_node.stop_clickhouse(kill=True) settings = {"distributed_ddl_task_timeout": 5} - assert "There are 1 unfinished hosts (0 of them are 
currently executing the task" in competing_node.query_and_get_error( - "ALTER TABLE alters_from_different_replicas.concurrent_test ADD COLUMN Added0 UInt32;", - settings=settings, + assert ( + "There are 1 unfinished hosts (0 of them are currently executing the task" + in competing_node.query_and_get_error( + "ALTER TABLE alters_from_different_replicas.concurrent_test ADD COLUMN Added0 UInt32;", + settings=settings, + ) ) settings = { "distributed_ddl_task_timeout": 5, diff --git a/tests/integration/test_replicated_database_cluster_groups/test.py b/tests/integration/test_replicated_database_cluster_groups/test.py index 5a315707efb..647626d8014 100644 --- a/tests/integration/test_replicated_database_cluster_groups/test.py +++ b/tests/integration/test_replicated_database_cluster_groups/test.py @@ -95,9 +95,12 @@ def test_cluster_groups(started_cluster): # Exception main_node_2.stop_clickhouse() settings = {"distributed_ddl_task_timeout": 5} - assert "There are 1 unfinished hosts (0 of them are currently executing the task)" in main_node_1.query_and_get_error( - "CREATE TABLE cluster_groups.table_2 (d Date, k UInt64) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);", - settings=settings, + assert ( + "There are 1 unfinished hosts (0 of them are currently executing the task)" + in main_node_1.query_and_get_error( + "CREATE TABLE cluster_groups.table_2 (d Date, k UInt64) ENGINE=ReplicatedMergeTree ORDER BY k PARTITION BY toYYYYMM(d);", + settings=settings, + ) ) # 3. After start both groups are synced diff --git a/tests/integration/test_storage_s3/s3_mocks/unstable_server.py b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py index 3632fa15d8a..5ef781bdc9e 100644 --- a/tests/integration/test_storage_s3/s3_mocks/unstable_server.py +++ b/tests/integration/test_storage_s3/s3_mocks/unstable_server.py @@ -9,7 +9,7 @@ import time def gen_n_digit_number(n): assert 0 < n < 19 - return random.randint(10 ** (n - 1), 10 ** n - 1) + return random.randint(10 ** (n - 1), 10**n - 1) sum_in_4_column = 0 diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index e941356261a..2549cb0d473 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -553,13 +553,16 @@ def test_multipart(started_cluster, maybe_auth, positive): assert csv_data == get_s3_file_content(started_cluster, bucket, filename) # select uploaded data from many threads - select_query = "select sum(column1), sum(column2), sum(column3) " "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( - host=started_cluster.minio_redirect_host, - port=started_cluster.minio_redirect_port, - bucket=bucket, - filename=filename, - auth=maybe_auth, - table_format=table_format, + select_query = ( + "select sum(column1), sum(column2), sum(column3) " + "from s3('http://{host}:{port}/{bucket}/{filename}', {auth}'CSV', '{table_format}')".format( + host=started_cluster.minio_redirect_host, + port=started_cluster.minio_redirect_port, + bucket=bucket, + filename=filename, + auth=maybe_auth, + table_format=table_format, + ) ) try: select_result = run_query( diff --git a/tests/integration/test_storage_url/test.py b/tests/integration/test_storage_url/test.py index 771df49cbac..7ff7a871413 100644 --- a/tests/integration/test_storage_url/test.py +++ b/tests/integration/test_storage_url/test.py @@ -79,21 +79,15 @@ def test_table_function_url_access_rights(): f"SELECT * FROM url('http://nginx:80/test_1', 'TSV')", user="u1" 
) - assert ( - node1.query( - f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", - user="u1", - ) - == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) - ) + assert node1.query( + f"DESCRIBE TABLE url('http://nginx:80/test_1', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) - assert ( - node1.query( - f"DESCRIBE TABLE url('http://nginx:80/not-exist', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", - user="u1", - ) - == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) - ) + assert node1.query( + f"DESCRIBE TABLE url('http://nginx:80/not-exist', 'TSV', 'column1 UInt32, column2 UInt32, column3 UInt32')", + user="u1", + ) == TSV([["column1", "UInt32"], ["column2", "UInt32"], ["column3", "UInt32"]]) expected_error = "necessary to have the grant URL ON *.*" assert expected_error in node1.query_and_get_error( diff --git a/tests/integration/test_system_merges/test.py b/tests/integration/test_system_merges/test.py index bacb0eb500d..6dbe6c891f2 100644 --- a/tests/integration/test_system_merges/test.py +++ b/tests/integration/test_system_merges/test.py @@ -204,33 +204,36 @@ def test_mutation_simple(started_cluster, replicated): sleep_time=0.1, ) - assert split_tsv( - node_check.query( - """ + assert ( + split_tsv( + node_check.query( + """ SELECT database, table, num_parts, source_part_names, source_part_paths, result_part_name, result_part_path, partition_id, is_mutation FROM system.merges WHERE table = '{name}' """.format( - name=table_name + name=table_name + ) ) ) - ) == [ - [ - db_name, - table_name, - "1", - "['{}']".format(part), - "['{clickhouse}/{table_path}/{}/']".format( - part, clickhouse=clickhouse_path, table_path=table_path - ), - result_part, - "{clickhouse}/{table_path}/{}/".format( - result_part, clickhouse=clickhouse_path, table_path=table_path - ), - "all", - "1", - ], - ] + == [ + [ + db_name, + table_name, + "1", + "['{}']".format(part), + "['{clickhouse}/{table_path}/{}/']".format( + part, clickhouse=clickhouse_path, table_path=table_path + ), + result_part, + "{clickhouse}/{table_path}/{}/".format( + result_part, clickhouse=clickhouse_path, table_path=table_path + ), + "all", + "1", + ], + ] + ) t.join() assert ( diff --git a/utils/grpc-client/pb2/clickhouse_grpc_pb2.py b/utils/grpc-client/pb2/clickhouse_grpc_pb2.py index 9bf7817c7d3..6218047af3c 100644 --- a/utils/grpc-client/pb2/clickhouse_grpc_pb2.py +++ b/utils/grpc-client/pb2/clickhouse_grpc_pb2.py @@ -8,17 +8,16 @@ from google.protobuf import descriptor_pool as _descriptor_pool from google.protobuf import message as _message from google.protobuf import reflection as _reflection from google.protobuf import symbol_database as _symbol_database - # @@protoc_insertion_point(imports) _sym_db = _symbol_database.Default() -DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b'\n\x15\x63lickhouse_grpc.proto\x12\x0f\x63lickhouse.grpc")\n\x0bNameAndType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t"\xf5\x01\n\rExternalTable\x12\x0c\n\x04name\x18\x01 \x01(\t\x12-\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x12\x0e\n\x06\x66ormat\x18\x04 \x01(\t\x12\x18\n\x10\x63ompression_type\x18\x06 \x01(\t\x12>\n\x08settings\x18\x05 
\x03(\x0b\x32,.clickhouse.grpc.ExternalTable.SettingsEntry\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\x85\x03\n\x1cObsoleteTransportCompression\x12U\n\talgorithm\x18\x01 \x01(\x0e\x32\x42.clickhouse.grpc.ObsoleteTransportCompression.CompressionAlgorithm\x12M\n\x05level\x18\x02 \x01(\x0e\x32>.clickhouse.grpc.ObsoleteTransportCompression.CompressionLevel"R\n\x14\x43ompressionAlgorithm\x12\x12\n\x0eNO_COMPRESSION\x10\x00\x12\x0b\n\x07\x44\x45\x46LATE\x10\x01\x12\x08\n\x04GZIP\x10\x02\x12\x0f\n\x0bSTREAM_GZIP\x10\x03"k\n\x10\x43ompressionLevel\x12\x14\n\x10\x43OMPRESSION_NONE\x10\x00\x12\x13\n\x0f\x43OMPRESSION_LOW\x10\x01\x12\x16\n\x12\x43OMPRESSION_MEDIUM\x10\x02\x12\x14\n\x10\x43OMPRESSION_HIGH\x10\x03"\x8e\x06\n\tQueryInfo\x12\r\n\x05query\x18\x01 \x01(\t\x12\x10\n\x08query_id\x18\x02 \x01(\t\x12:\n\x08settings\x18\x03 \x03(\x0b\x32(.clickhouse.grpc.QueryInfo.SettingsEntry\x12\x10\n\x08\x64\x61tabase\x18\x04 \x01(\t\x12\x12\n\ninput_data\x18\x05 \x01(\x0c\x12\x1c\n\x14input_data_delimiter\x18\x06 \x01(\x0c\x12\x15\n\routput_format\x18\x07 \x01(\t\x12\x1b\n\x13send_output_columns\x18\x18 \x01(\x08\x12\x37\n\x0f\x65xternal_tables\x18\x08 \x03(\x0b\x32\x1e.clickhouse.grpc.ExternalTable\x12\x11\n\tuser_name\x18\t \x01(\t\x12\x10\n\x08password\x18\n \x01(\t\x12\r\n\x05quota\x18\x0b \x01(\t\x12\x12\n\nsession_id\x18\x0c \x01(\t\x12\x15\n\rsession_check\x18\r \x01(\x08\x12\x17\n\x0fsession_timeout\x18\x0e \x01(\r\x12\x0e\n\x06\x63\x61ncel\x18\x0f \x01(\x08\x12\x17\n\x0fnext_query_info\x18\x10 \x01(\x08\x12\x1e\n\x16input_compression_type\x18\x14 \x01(\t\x12\x1f\n\x17output_compression_type\x18\x15 \x01(\t\x12 \n\x18output_compression_level\x18\x13 \x01(\x05\x12"\n\x1atransport_compression_type\x18\x16 \x01(\t\x12#\n\x1btransport_compression_level\x18\x17 \x01(\x05\x12R\n\x1bobsolete_result_compression\x18\x11 \x01(\x0b\x32-.clickhouse.grpc.ObsoleteTransportCompression\x12!\n\x19obsolete_compression_type\x18\x12 \x01(\t\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01"\xa1\x01\n\x08LogEntry\x12\x0c\n\x04time\x18\x01 \x01(\r\x12\x19\n\x11time_microseconds\x18\x02 \x01(\r\x12\x11\n\tthread_id\x18\x03 \x01(\x04\x12\x10\n\x08query_id\x18\x04 \x01(\t\x12)\n\x05level\x18\x05 \x01(\x0e\x32\x1a.clickhouse.grpc.LogsLevel\x12\x0e\n\x06source\x18\x06 \x01(\t\x12\x0c\n\x04text\x18\x07 \x01(\t"z\n\x08Progress\x12\x11\n\tread_rows\x18\x01 \x01(\x04\x12\x12\n\nread_bytes\x18\x02 \x01(\x04\x12\x1a\n\x12total_rows_to_read\x18\x03 \x01(\x04\x12\x14\n\x0cwritten_rows\x18\x04 \x01(\x04\x12\x15\n\rwritten_bytes\x18\x05 \x01(\x04"p\n\x05Stats\x12\x0c\n\x04rows\x18\x01 \x01(\x04\x12\x0e\n\x06\x62locks\x18\x02 \x01(\x04\x12\x17\n\x0f\x61llocated_bytes\x18\x03 \x01(\x04\x12\x15\n\rapplied_limit\x18\x04 \x01(\x08\x12\x19\n\x11rows_before_limit\x18\x05 \x01(\x04"R\n\tException\x12\x0c\n\x04\x63ode\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x14\n\x0c\x64isplay_text\x18\x03 \x01(\t\x12\x13\n\x0bstack_trace\x18\x04 \x01(\t"\xeb\x02\n\x06Result\x12\x10\n\x08query_id\x18\t \x01(\t\x12\x11\n\ttime_zone\x18\n \x01(\t\x12\x15\n\routput_format\x18\x0b \x01(\t\x12\x34\n\x0eoutput_columns\x18\x0c \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0e\n\x06output\x18\x01 \x01(\x0c\x12\x0e\n\x06totals\x18\x02 \x01(\x0c\x12\x10\n\x08\x65xtremes\x18\x03 \x01(\x0c\x12\'\n\x04logs\x18\x04 \x03(\x0b\x32\x19.clickhouse.grpc.LogEntry\x12+\n\x08progress\x18\x05 
\x01(\x0b\x32\x19.clickhouse.grpc.Progress\x12%\n\x05stats\x18\x06 \x01(\x0b\x32\x16.clickhouse.grpc.Stats\x12-\n\texception\x18\x07 \x01(\x0b\x32\x1a.clickhouse.grpc.Exception\x12\x11\n\tcancelled\x18\x08 \x01(\x08*\x9d\x01\n\tLogsLevel\x12\x0c\n\x08LOG_NONE\x10\x00\x12\r\n\tLOG_FATAL\x10\x01\x12\x10\n\x0cLOG_CRITICAL\x10\x02\x12\r\n\tLOG_ERROR\x10\x03\x12\x0f\n\x0bLOG_WARNING\x10\x04\x12\x0e\n\nLOG_NOTICE\x10\x05\x12\x13\n\x0fLOG_INFORMATION\x10\x06\x12\r\n\tLOG_DEBUG\x10\x07\x12\r\n\tLOG_TRACE\x10\x08\x32\xdb\x02\n\nClickHouse\x12\x45\n\x0c\x45xecuteQuery\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00\x12V\n\x1b\x45xecuteQueryWithStreamInput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00(\x01\x12W\n\x1c\x45xecuteQueryWithStreamOutput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00\x30\x01\x12U\n\x18\x45xecuteQueryWithStreamIO\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result"\x00(\x01\x30\x01\x62\x06proto3' -) -_LOGSLEVEL = DESCRIPTOR.enum_types_by_name["LogsLevel"] + +DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\x15\x63lickhouse_grpc.proto\x12\x0f\x63lickhouse.grpc\")\n\x0bNameAndType\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x0c\n\x04type\x18\x02 \x01(\t\"\xf5\x01\n\rExternalTable\x12\x0c\n\x04name\x18\x01 \x01(\t\x12-\n\x07\x63olumns\x18\x02 \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x12\x0e\n\x06\x66ormat\x18\x04 \x01(\t\x12\x18\n\x10\x63ompression_type\x18\x06 \x01(\t\x12>\n\x08settings\x18\x05 \x03(\x0b\x32,.clickhouse.grpc.ExternalTable.SettingsEntry\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\x85\x03\n\x1cObsoleteTransportCompression\x12U\n\talgorithm\x18\x01 \x01(\x0e\x32\x42.clickhouse.grpc.ObsoleteTransportCompression.CompressionAlgorithm\x12M\n\x05level\x18\x02 \x01(\x0e\x32>.clickhouse.grpc.ObsoleteTransportCompression.CompressionLevel\"R\n\x14\x43ompressionAlgorithm\x12\x12\n\x0eNO_COMPRESSION\x10\x00\x12\x0b\n\x07\x44\x45\x46LATE\x10\x01\x12\x08\n\x04GZIP\x10\x02\x12\x0f\n\x0bSTREAM_GZIP\x10\x03\"k\n\x10\x43ompressionLevel\x12\x14\n\x10\x43OMPRESSION_NONE\x10\x00\x12\x13\n\x0f\x43OMPRESSION_LOW\x10\x01\x12\x16\n\x12\x43OMPRESSION_MEDIUM\x10\x02\x12\x14\n\x10\x43OMPRESSION_HIGH\x10\x03\"\x8e\x06\n\tQueryInfo\x12\r\n\x05query\x18\x01 \x01(\t\x12\x10\n\x08query_id\x18\x02 \x01(\t\x12:\n\x08settings\x18\x03 \x03(\x0b\x32(.clickhouse.grpc.QueryInfo.SettingsEntry\x12\x10\n\x08\x64\x61tabase\x18\x04 \x01(\t\x12\x12\n\ninput_data\x18\x05 \x01(\x0c\x12\x1c\n\x14input_data_delimiter\x18\x06 \x01(\x0c\x12\x15\n\routput_format\x18\x07 \x01(\t\x12\x1b\n\x13send_output_columns\x18\x18 \x01(\x08\x12\x37\n\x0f\x65xternal_tables\x18\x08 \x03(\x0b\x32\x1e.clickhouse.grpc.ExternalTable\x12\x11\n\tuser_name\x18\t \x01(\t\x12\x10\n\x08password\x18\n \x01(\t\x12\r\n\x05quota\x18\x0b \x01(\t\x12\x12\n\nsession_id\x18\x0c \x01(\t\x12\x15\n\rsession_check\x18\r \x01(\x08\x12\x17\n\x0fsession_timeout\x18\x0e \x01(\r\x12\x0e\n\x06\x63\x61ncel\x18\x0f \x01(\x08\x12\x17\n\x0fnext_query_info\x18\x10 \x01(\x08\x12\x1e\n\x16input_compression_type\x18\x14 \x01(\t\x12\x1f\n\x17output_compression_type\x18\x15 \x01(\t\x12 \n\x18output_compression_level\x18\x13 \x01(\x05\x12\"\n\x1atransport_compression_type\x18\x16 \x01(\t\x12#\n\x1btransport_compression_level\x18\x17 \x01(\x05\x12R\n\x1bobsolete_result_compression\x18\x11 
\x01(\x0b\x32-.clickhouse.grpc.ObsoleteTransportCompression\x12!\n\x19obsolete_compression_type\x18\x12 \x01(\t\x1a/\n\rSettingsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"\xa1\x01\n\x08LogEntry\x12\x0c\n\x04time\x18\x01 \x01(\r\x12\x19\n\x11time_microseconds\x18\x02 \x01(\r\x12\x11\n\tthread_id\x18\x03 \x01(\x04\x12\x10\n\x08query_id\x18\x04 \x01(\t\x12)\n\x05level\x18\x05 \x01(\x0e\x32\x1a.clickhouse.grpc.LogsLevel\x12\x0e\n\x06source\x18\x06 \x01(\t\x12\x0c\n\x04text\x18\x07 \x01(\t\"z\n\x08Progress\x12\x11\n\tread_rows\x18\x01 \x01(\x04\x12\x12\n\nread_bytes\x18\x02 \x01(\x04\x12\x1a\n\x12total_rows_to_read\x18\x03 \x01(\x04\x12\x14\n\x0cwritten_rows\x18\x04 \x01(\x04\x12\x15\n\rwritten_bytes\x18\x05 \x01(\x04\"p\n\x05Stats\x12\x0c\n\x04rows\x18\x01 \x01(\x04\x12\x0e\n\x06\x62locks\x18\x02 \x01(\x04\x12\x17\n\x0f\x61llocated_bytes\x18\x03 \x01(\x04\x12\x15\n\rapplied_limit\x18\x04 \x01(\x08\x12\x19\n\x11rows_before_limit\x18\x05 \x01(\x04\"R\n\tException\x12\x0c\n\x04\x63ode\x18\x01 \x01(\x05\x12\x0c\n\x04name\x18\x02 \x01(\t\x12\x14\n\x0c\x64isplay_text\x18\x03 \x01(\t\x12\x13\n\x0bstack_trace\x18\x04 \x01(\t\"\xeb\x02\n\x06Result\x12\x10\n\x08query_id\x18\t \x01(\t\x12\x11\n\ttime_zone\x18\n \x01(\t\x12\x15\n\routput_format\x18\x0b \x01(\t\x12\x34\n\x0eoutput_columns\x18\x0c \x03(\x0b\x32\x1c.clickhouse.grpc.NameAndType\x12\x0e\n\x06output\x18\x01 \x01(\x0c\x12\x0e\n\x06totals\x18\x02 \x01(\x0c\x12\x10\n\x08\x65xtremes\x18\x03 \x01(\x0c\x12\'\n\x04logs\x18\x04 \x03(\x0b\x32\x19.clickhouse.grpc.LogEntry\x12+\n\x08progress\x18\x05 \x01(\x0b\x32\x19.clickhouse.grpc.Progress\x12%\n\x05stats\x18\x06 \x01(\x0b\x32\x16.clickhouse.grpc.Stats\x12-\n\texception\x18\x07 \x01(\x0b\x32\x1a.clickhouse.grpc.Exception\x12\x11\n\tcancelled\x18\x08 \x01(\x08*\x9d\x01\n\tLogsLevel\x12\x0c\n\x08LOG_NONE\x10\x00\x12\r\n\tLOG_FATAL\x10\x01\x12\x10\n\x0cLOG_CRITICAL\x10\x02\x12\r\n\tLOG_ERROR\x10\x03\x12\x0f\n\x0bLOG_WARNING\x10\x04\x12\x0e\n\nLOG_NOTICE\x10\x05\x12\x13\n\x0fLOG_INFORMATION\x10\x06\x12\r\n\tLOG_DEBUG\x10\x07\x12\r\n\tLOG_TRACE\x10\x08\x32\xdb\x02\n\nClickHouse\x12\x45\n\x0c\x45xecuteQuery\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00\x12V\n\x1b\x45xecuteQueryWithStreamInput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00(\x01\x12W\n\x1c\x45xecuteQueryWithStreamOutput\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00\x30\x01\x12U\n\x18\x45xecuteQueryWithStreamIO\x12\x1a.clickhouse.grpc.QueryInfo\x1a\x17.clickhouse.grpc.Result\"\x00(\x01\x30\x01\x62\x06proto3') + +_LOGSLEVEL = DESCRIPTOR.enum_types_by_name['LogsLevel'] LogsLevel = enum_type_wrapper.EnumTypeWrapper(_LOGSLEVEL) LOG_NONE = 0 LOG_FATAL = 1 @@ -31,180 +30,134 @@ LOG_DEBUG = 7 LOG_TRACE = 8 -_NAMEANDTYPE = DESCRIPTOR.message_types_by_name["NameAndType"] -_EXTERNALTABLE = DESCRIPTOR.message_types_by_name["ExternalTable"] -_EXTERNALTABLE_SETTINGSENTRY = _EXTERNALTABLE.nested_types_by_name["SettingsEntry"] -_OBSOLETETRANSPORTCOMPRESSION = DESCRIPTOR.message_types_by_name[ - "ObsoleteTransportCompression" -] -_QUERYINFO = DESCRIPTOR.message_types_by_name["QueryInfo"] -_QUERYINFO_SETTINGSENTRY = _QUERYINFO.nested_types_by_name["SettingsEntry"] -_LOGENTRY = DESCRIPTOR.message_types_by_name["LogEntry"] -_PROGRESS = DESCRIPTOR.message_types_by_name["Progress"] -_STATS = DESCRIPTOR.message_types_by_name["Stats"] -_EXCEPTION = DESCRIPTOR.message_types_by_name["Exception"] -_RESULT = DESCRIPTOR.message_types_by_name["Result"] 
-_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM = ( - _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name["CompressionAlgorithm"] -) -_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL = ( - _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name["CompressionLevel"] -) -NameAndType = _reflection.GeneratedProtocolMessageType( - "NameAndType", - (_message.Message,), - { - "DESCRIPTOR": _NAMEANDTYPE, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.NameAndType) - }, -) +_NAMEANDTYPE = DESCRIPTOR.message_types_by_name['NameAndType'] +_EXTERNALTABLE = DESCRIPTOR.message_types_by_name['ExternalTable'] +_EXTERNALTABLE_SETTINGSENTRY = _EXTERNALTABLE.nested_types_by_name['SettingsEntry'] +_OBSOLETETRANSPORTCOMPRESSION = DESCRIPTOR.message_types_by_name['ObsoleteTransportCompression'] +_QUERYINFO = DESCRIPTOR.message_types_by_name['QueryInfo'] +_QUERYINFO_SETTINGSENTRY = _QUERYINFO.nested_types_by_name['SettingsEntry'] +_LOGENTRY = DESCRIPTOR.message_types_by_name['LogEntry'] +_PROGRESS = DESCRIPTOR.message_types_by_name['Progress'] +_STATS = DESCRIPTOR.message_types_by_name['Stats'] +_EXCEPTION = DESCRIPTOR.message_types_by_name['Exception'] +_RESULT = DESCRIPTOR.message_types_by_name['Result'] +_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM = _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name['CompressionAlgorithm'] +_OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL = _OBSOLETETRANSPORTCOMPRESSION.enum_types_by_name['CompressionLevel'] +NameAndType = _reflection.GeneratedProtocolMessageType('NameAndType', (_message.Message,), { + 'DESCRIPTOR' : _NAMEANDTYPE, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.NameAndType) + }) _sym_db.RegisterMessage(NameAndType) -ExternalTable = _reflection.GeneratedProtocolMessageType( - "ExternalTable", - (_message.Message,), - { - "SettingsEntry": _reflection.GeneratedProtocolMessageType( - "SettingsEntry", - (_message.Message,), - { - "DESCRIPTOR": _EXTERNALTABLE_SETTINGSENTRY, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable.SettingsEntry) - }, - ), - "DESCRIPTOR": _EXTERNALTABLE, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable) - }, -) +ExternalTable = _reflection.GeneratedProtocolMessageType('ExternalTable', (_message.Message,), { + + 'SettingsEntry' : _reflection.GeneratedProtocolMessageType('SettingsEntry', (_message.Message,), { + 'DESCRIPTOR' : _EXTERNALTABLE_SETTINGSENTRY, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable.SettingsEntry) + }) + , + 'DESCRIPTOR' : _EXTERNALTABLE, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ExternalTable) + }) _sym_db.RegisterMessage(ExternalTable) _sym_db.RegisterMessage(ExternalTable.SettingsEntry) -ObsoleteTransportCompression = _reflection.GeneratedProtocolMessageType( - "ObsoleteTransportCompression", - (_message.Message,), - { - "DESCRIPTOR": _OBSOLETETRANSPORTCOMPRESSION, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.ObsoleteTransportCompression) - }, -) +ObsoleteTransportCompression = _reflection.GeneratedProtocolMessageType('ObsoleteTransportCompression', (_message.Message,), { + 'DESCRIPTOR' : _OBSOLETETRANSPORTCOMPRESSION, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.ObsoleteTransportCompression) + }) 
_sym_db.RegisterMessage(ObsoleteTransportCompression) -QueryInfo = _reflection.GeneratedProtocolMessageType( - "QueryInfo", - (_message.Message,), - { - "SettingsEntry": _reflection.GeneratedProtocolMessageType( - "SettingsEntry", - (_message.Message,), - { - "DESCRIPTOR": _QUERYINFO_SETTINGSENTRY, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo.SettingsEntry) - }, - ), - "DESCRIPTOR": _QUERYINFO, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo) - }, -) +QueryInfo = _reflection.GeneratedProtocolMessageType('QueryInfo', (_message.Message,), { + + 'SettingsEntry' : _reflection.GeneratedProtocolMessageType('SettingsEntry', (_message.Message,), { + 'DESCRIPTOR' : _QUERYINFO_SETTINGSENTRY, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo.SettingsEntry) + }) + , + 'DESCRIPTOR' : _QUERYINFO, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.QueryInfo) + }) _sym_db.RegisterMessage(QueryInfo) _sym_db.RegisterMessage(QueryInfo.SettingsEntry) -LogEntry = _reflection.GeneratedProtocolMessageType( - "LogEntry", - (_message.Message,), - { - "DESCRIPTOR": _LOGENTRY, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.LogEntry) - }, -) +LogEntry = _reflection.GeneratedProtocolMessageType('LogEntry', (_message.Message,), { + 'DESCRIPTOR' : _LOGENTRY, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.LogEntry) + }) _sym_db.RegisterMessage(LogEntry) -Progress = _reflection.GeneratedProtocolMessageType( - "Progress", - (_message.Message,), - { - "DESCRIPTOR": _PROGRESS, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Progress) - }, -) +Progress = _reflection.GeneratedProtocolMessageType('Progress', (_message.Message,), { + 'DESCRIPTOR' : _PROGRESS, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Progress) + }) _sym_db.RegisterMessage(Progress) -Stats = _reflection.GeneratedProtocolMessageType( - "Stats", - (_message.Message,), - { - "DESCRIPTOR": _STATS, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Stats) - }, -) +Stats = _reflection.GeneratedProtocolMessageType('Stats', (_message.Message,), { + 'DESCRIPTOR' : _STATS, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Stats) + }) _sym_db.RegisterMessage(Stats) -Exception = _reflection.GeneratedProtocolMessageType( - "Exception", - (_message.Message,), - { - "DESCRIPTOR": _EXCEPTION, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Exception) - }, -) +Exception = _reflection.GeneratedProtocolMessageType('Exception', (_message.Message,), { + 'DESCRIPTOR' : _EXCEPTION, + '__module__' : 'clickhouse_grpc_pb2' + # @@protoc_insertion_point(class_scope:clickhouse.grpc.Exception) + }) _sym_db.RegisterMessage(Exception) -Result = _reflection.GeneratedProtocolMessageType( - "Result", - (_message.Message,), - { - "DESCRIPTOR": _RESULT, - "__module__": "clickhouse_grpc_pb2" - # @@protoc_insertion_point(class_scope:clickhouse.grpc.Result) - }, -) +Result = _reflection.GeneratedProtocolMessageType('Result', (_message.Message,), { + 'DESCRIPTOR' : _RESULT, + '__module__' : 'clickhouse_grpc_pb2' + # 
@@protoc_insertion_point(class_scope:clickhouse.grpc.Result) + }) _sym_db.RegisterMessage(Result) -_CLICKHOUSE = DESCRIPTOR.services_by_name["ClickHouse"] +_CLICKHOUSE = DESCRIPTOR.services_by_name['ClickHouse'] if _descriptor._USE_C_DESCRIPTORS == False: - DESCRIPTOR._options = None - _EXTERNALTABLE_SETTINGSENTRY._options = None - _EXTERNALTABLE_SETTINGSENTRY._serialized_options = b"8\001" - _QUERYINFO_SETTINGSENTRY._options = None - _QUERYINFO_SETTINGSENTRY._serialized_options = b"8\001" - _LOGSLEVEL._serialized_start = 2363 - _LOGSLEVEL._serialized_end = 2520 - _NAMEANDTYPE._serialized_start = 42 - _NAMEANDTYPE._serialized_end = 83 - _EXTERNALTABLE._serialized_start = 86 - _EXTERNALTABLE._serialized_end = 331 - _EXTERNALTABLE_SETTINGSENTRY._serialized_start = 284 - _EXTERNALTABLE_SETTINGSENTRY._serialized_end = 331 - _OBSOLETETRANSPORTCOMPRESSION._serialized_start = 334 - _OBSOLETETRANSPORTCOMPRESSION._serialized_end = 723 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_start = 532 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_end = 614 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_start = 616 - _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_end = 723 - _QUERYINFO._serialized_start = 726 - _QUERYINFO._serialized_end = 1508 - _QUERYINFO_SETTINGSENTRY._serialized_start = 284 - _QUERYINFO_SETTINGSENTRY._serialized_end = 331 - _LOGENTRY._serialized_start = 1511 - _LOGENTRY._serialized_end = 1672 - _PROGRESS._serialized_start = 1674 - _PROGRESS._serialized_end = 1796 - _STATS._serialized_start = 1798 - _STATS._serialized_end = 1910 - _EXCEPTION._serialized_start = 1912 - _EXCEPTION._serialized_end = 1994 - _RESULT._serialized_start = 1997 - _RESULT._serialized_end = 2360 - _CLICKHOUSE._serialized_start = 2523 - _CLICKHOUSE._serialized_end = 2870 + DESCRIPTOR._options = None + _EXTERNALTABLE_SETTINGSENTRY._options = None + _EXTERNALTABLE_SETTINGSENTRY._serialized_options = b'8\001' + _QUERYINFO_SETTINGSENTRY._options = None + _QUERYINFO_SETTINGSENTRY._serialized_options = b'8\001' + _LOGSLEVEL._serialized_start=2363 + _LOGSLEVEL._serialized_end=2520 + _NAMEANDTYPE._serialized_start=42 + _NAMEANDTYPE._serialized_end=83 + _EXTERNALTABLE._serialized_start=86 + _EXTERNALTABLE._serialized_end=331 + _EXTERNALTABLE_SETTINGSENTRY._serialized_start=284 + _EXTERNALTABLE_SETTINGSENTRY._serialized_end=331 + _OBSOLETETRANSPORTCOMPRESSION._serialized_start=334 + _OBSOLETETRANSPORTCOMPRESSION._serialized_end=723 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_start=532 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONALGORITHM._serialized_end=614 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_start=616 + _OBSOLETETRANSPORTCOMPRESSION_COMPRESSIONLEVEL._serialized_end=723 + _QUERYINFO._serialized_start=726 + _QUERYINFO._serialized_end=1508 + _QUERYINFO_SETTINGSENTRY._serialized_start=284 + _QUERYINFO_SETTINGSENTRY._serialized_end=331 + _LOGENTRY._serialized_start=1511 + _LOGENTRY._serialized_end=1672 + _PROGRESS._serialized_start=1674 + _PROGRESS._serialized_end=1796 + _STATS._serialized_start=1798 + _STATS._serialized_end=1910 + _EXCEPTION._serialized_start=1912 + _EXCEPTION._serialized_end=1994 + _RESULT._serialized_start=1997 + _RESULT._serialized_end=2360 + _CLICKHOUSE._serialized_start=2523 + _CLICKHOUSE._serialized_end=2870 # @@protoc_insertion_point(module_scope) diff --git a/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py b/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py index 25643a243b3..1c71218bbe5 100644 
--- a/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py +++ b/utils/grpc-client/pb2/clickhouse_grpc_pb2_grpc.py @@ -15,25 +15,25 @@ class ClickHouseStub(object): channel: A grpc.Channel. """ self.ExecuteQuery = channel.unary_unary( - "/clickhouse.grpc.ClickHouse/ExecuteQuery", - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + '/clickhouse.grpc.ClickHouse/ExecuteQuery', + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamInput = channel.stream_unary( - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput", - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput', + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamOutput = channel.unary_stream( - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput", - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput', + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) self.ExecuteQueryWithStreamIO = channel.stream_stream( - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO", - request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, - response_deserializer=clickhouse__grpc__pb2.Result.FromString, - ) + '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO', + request_serializer=clickhouse__grpc__pb2.QueryInfo.SerializeToString, + response_deserializer=clickhouse__grpc__pb2.Result.FromString, + ) class ClickHouseServicer(object): @@ -42,173 +42,124 @@ class ClickHouseServicer(object): def ExecuteQuery(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def ExecuteQueryWithStreamInput(self, request_iterator, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def ExecuteQueryWithStreamOutput(self, request, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise NotImplementedError('Method not implemented!') def ExecuteQueryWithStreamIO(self, request_iterator, context): """Missing associated documentation comment in .proto file.""" context.set_code(grpc.StatusCode.UNIMPLEMENTED) - context.set_details("Method not implemented!") - raise NotImplementedError("Method not implemented!") + context.set_details('Method not implemented!') + raise 
NotImplementedError('Method not implemented!') def add_ClickHouseServicer_to_server(servicer, server): rpc_method_handlers = { - "ExecuteQuery": grpc.unary_unary_rpc_method_handler( - servicer.ExecuteQuery, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - "ExecuteQueryWithStreamInput": grpc.stream_unary_rpc_method_handler( - servicer.ExecuteQueryWithStreamInput, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - "ExecuteQueryWithStreamOutput": grpc.unary_stream_rpc_method_handler( - servicer.ExecuteQueryWithStreamOutput, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), - "ExecuteQueryWithStreamIO": grpc.stream_stream_rpc_method_handler( - servicer.ExecuteQueryWithStreamIO, - request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, - response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, - ), + 'ExecuteQuery': grpc.unary_unary_rpc_method_handler( + servicer.ExecuteQuery, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + 'ExecuteQueryWithStreamInput': grpc.stream_unary_rpc_method_handler( + servicer.ExecuteQueryWithStreamInput, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + 'ExecuteQueryWithStreamOutput': grpc.unary_stream_rpc_method_handler( + servicer.ExecuteQueryWithStreamOutput, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), + 'ExecuteQueryWithStreamIO': grpc.stream_stream_rpc_method_handler( + servicer.ExecuteQueryWithStreamIO, + request_deserializer=clickhouse__grpc__pb2.QueryInfo.FromString, + response_serializer=clickhouse__grpc__pb2.Result.SerializeToString, + ), } generic_handler = grpc.method_handlers_generic_handler( - "clickhouse.grpc.ClickHouse", rpc_method_handlers - ) + 'clickhouse.grpc.ClickHouse', rpc_method_handlers) server.add_generic_rpc_handlers((generic_handler,)) -# This class is part of an EXPERIMENTAL API. + # This class is part of an EXPERIMENTAL API. 
class ClickHouse(object): """Missing associated documentation comment in .proto file.""" @staticmethod - def ExecuteQuery( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_unary( - request, + def ExecuteQuery(request, target, - "/clickhouse.grpc.ClickHouse/ExecuteQuery", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_unary(request, target, '/clickhouse.grpc.ClickHouse/ExecuteQuery', clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def ExecuteQueryWithStreamInput( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_unary( - request_iterator, + def ExecuteQueryWithStreamInput(request_iterator, target, - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.stream_unary(request_iterator, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamInput', clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def ExecuteQueryWithStreamOutput( - request, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.unary_stream( - request, + def ExecuteQueryWithStreamOutput(request, target, - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput", + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.unary_stream(request, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamOutput', clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) @staticmethod - def ExecuteQueryWithStreamIO( - request_iterator, - target, - options=(), - channel_credentials=None, - call_credentials=None, - insecure=False, - compression=None, - wait_for_ready=None, - timeout=None, - metadata=None, - ): - return grpc.experimental.stream_stream( - request_iterator, + def ExecuteQueryWithStreamIO(request_iterator, target, - "/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO", + options=(), + channel_credentials=None, + 
call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None): + return grpc.experimental.stream_stream(request_iterator, target, '/clickhouse.grpc.ClickHouse/ExecuteQueryWithStreamIO', clickhouse__grpc__pb2.QueryInfo.SerializeToString, clickhouse__grpc__pb2.Result.FromString, - options, - channel_credentials, - insecure, - call_credentials, - compression, - wait_for_ready, - timeout, - metadata, - ) + options, channel_credentials, + insecure, call_credentials, compression, wait_for_ready, timeout, metadata) From 5cda358e62c90a2345a60a249b6d7e8430f6454d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 06:26:04 +0100 Subject: [PATCH 083/884] Obey Python's quirky formatter --- tests/clickhouse-test | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index bd796dbfdf2..dd9047c293f 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -75,11 +75,13 @@ def stringhash(s): # only during process invocation https://stackoverflow.com/a/42089311 return zlib.crc32(s.encode("utf-8")) + def read_file_as_binary_string(file_path): - with open(file_path, 'rb') as file: + with open(file_path, "rb") as file: binary_data = file.read() return binary_data + # First and last lines of the log def trim_for_log(s): if not s: @@ -146,7 +148,7 @@ def clickhouse_execute_http( client.request( "POST", f"/?{base_args.client_options_query_str}{urllib.parse.urlencode(params)}", - body=body + body=body, ) res = client.getresponse() data = res.read() From 71bef27abfa9cd64a318306ddd11b21b907a37ac Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 06:26:54 +0100 Subject: [PATCH 084/884] Follow-up --- docker/test/base/setup_export_logs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 26fcd10d666..96a15c33674 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -194,7 +194,7 @@ function setup_logs_replication echo "Creating table system.${table}_sender" >&2 # Create Distributed table and materialized view to watch on the original table: - clickhouse-client --asterisk_include_materialized_columns 1 --query " + clickhouse-client --query " CREATE TABLE system.${table}_sender ENGINE = Distributed(${CLICKHOUSE_CI_LOGS_CLUSTER}, default, ${table}_${hash}) SETTINGS flush_on_detach=0 @@ -205,7 +205,7 @@ function setup_logs_replication echo "Creating materialized view system.${table}_watcher" >&2 - clickhouse-client --asterisk_include_materialized_columns 1 --query " + clickhouse-client --query " CREATE MATERIALIZED VIEW system.${table}_watcher TO system.${table}_sender AS SELECT ${EXTRA_COLUMNS_EXPRESSION_FOR_TABLE}, * FROM system.${table} From c5024a5f6d7f88f0fd8dc2af2c52eb1c1d57d2c2 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Mon, 15 Jan 2024 23:36:48 +0100 Subject: [PATCH 085/884] Fix typo --- docker/test/base/setup_export_logs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 96a15c33674..416281c2aa3 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -30,7 +30,7 @@ EXTRA_COLUMNS_EXPRESSION_COVERAGE_LOG="${EXTRA_COLUMNS_EXPRESSION}, arrayMap(x - function __set_connection_args { - # It's impossible to use generous $CONNECTION_ARGS string, it's unsafe from 
word splitting perspective. + # It's impossible to use a generic $CONNECTION_ARGS string, it's unsafe from word splitting perspective. # That's why we must stick to the generated option CONNECTION_ARGS=( --receive_timeout=45 --send_timeout=45 --secure From 1caef191436fc05856be3b85b19cfcd97d0dc804 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 16 Jan 2024 09:44:52 +0100 Subject: [PATCH 086/884] Maybe better --- programs/main.cpp | 81 ++++++++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/programs/main.cpp b/programs/main.cpp index 8958d84e243..1ff7e5db560 100644 --- a/programs/main.cpp +++ b/programs/main.cpp @@ -394,6 +394,50 @@ void checkHarmfulEnvironmentVariables(char ** argv) } #endif + +#if defined(SANITIZE_COVERAGE) +__attribute__((no_sanitize("coverage"))) +void dumpCoverage() +{ + /// A user can request to dump the coverage information into files at exit. + /// This is useful for non-server applications such as clickhouse-format or clickhouse-client, + /// that cannot introspect it with SQL functions at runtime. + + /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for a filename 'prefix.pid' + /// containing the list of addresses of covered . + + /// The format is even simpler than Clang's "sancov": an array of 64-bit addresses, native byte order, no header. + + if (const char * coverage_filename_prefix = getenv("CLICKHOUSE_WRITE_COVERAGE")) // NOLINT(concurrency-mt-unsafe) + { + auto dump = [](const std::string & name, auto span) + { + /// Write only non-zeros. + std::vector data; + data.reserve(span.size()); + for (auto addr : span) + if (addr) + data.push_back(addr); + + int fd = ::open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0400); + if (-1 == fd) + { + writeError("Cannot open a file to write the coverage data\n"); + } + else + { + if (!writeRetry(fd, reinterpret_cast(data.data()), data.size() * sizeof(data[0]))) + writeError("Cannot write the coverage data to a file\n"); + if (0 != ::close(fd)) + writeError("Cannot close the file with coverage data\n"); + } + }; + + dump(fmt::format("{}.{}", coverage_filename_prefix, getpid()), getCumulativeCoverage()); + } +} +#endif + } bool isClickhouseApp(std::string_view app_suffix, std::vector & argv) @@ -517,42 +561,7 @@ int main(int argc_, char ** argv_) int exit_code = main_func(static_cast(argv.size()), argv.data()); #if defined(SANITIZE_COVERAGE) - /// A user can request to dump the coverage information into files at exit. - /// This is useful for non-server applications such as clickhouse-format or clickhouse-client, - /// that cannot introspect it with SQL functions at runtime. - - /// The CLICKHOUSE_WRITE_COVERAGE environment variable defines a prefix for a filename 'prefix.pid' - /// containing the list of addresses of covered . - - /// The format is even simpler than Clang's "sancov": an array of 64-bit addresses, native byte order, no header. - - if (const char * coverage_filename_prefix = getenv("CLICKHOUSE_WRITE_COVERAGE")) // NOLINT(concurrency-mt-unsafe) - { - auto dumpCoverage = [](const std::string & name, auto span) - { - /// Write only non-zeros. 
- std::vector data; - data.reserve(span.size()); - for (auto addr : span) - if (addr) - data.push_back(addr); - - int fd = ::open(name.c_str(), O_WRONLY | O_CREAT | O_TRUNC | O_CLOEXEC, 0400); - if (-1 == fd) - { - writeError("Cannot open a file to write the coverage data\n"); - } - else - { - if (!writeRetry(fd, reinterpret_cast(data.data()), data.size() * sizeof(data[0]))) - writeError("Cannot write the coverage data to a file\n"); - if (0 != ::close(fd)) - writeError("Cannot close the file with coverage data\n"); - } - }; - - dumpCoverage(fmt::format("{}.{}", coverage_filename_prefix, getpid()), getCumulativeCoverage()); - } + dumpCoverage(); #endif return exit_code; From 21082be9a681166b5585445c8aed62e705063081 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 16 Jan 2024 10:41:13 +0100 Subject: [PATCH 087/884] Better test --- tests/clickhouse-test | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index dd9047c293f..6d398115d43 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -1269,13 +1269,17 @@ class TestCase: file_pattern = "coverage.*" matching_files = glob.glob(file_pattern) for file_path in matching_files: - body = read_file_as_binary_string(file_path) - clickhouse_execute( - args, - f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', groupArray(data) FROM input('data UInt64') FORMAT RowBinary", - body=body, - retry_error_codes=True, - ) + try: + body = read_file_as_binary_string(file_path) + clickhouse_execute( + args, + f"INSERT INTO system.coverage_log SELECT now(), '{self.case}', groupArray(data) FROM input('data UInt64') FORMAT RowBinary", + body=body, + retry_error_codes=True, + ) + except Exception as e: + print("Cannot insert coverage data: ", str(e)) + # Remove the file even in case of exception to avoid accumulation and quadratic complexity. 
os.remove(file_path) coverage = clickhouse_execute( From 6b8d53a9fa54e53c766c431201ea8dfd742630ea Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 16 Jan 2024 11:07:30 +0100 Subject: [PATCH 088/884] Remove obsolete comment --- tests/clickhouse-test | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index 6d398115d43..02693b997b4 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -412,7 +412,6 @@ def get_stacktraces_from_gdb(server_pid): # collect server stacktraces from system.stack_trace table -# it does not work in Sandbox def get_stacktraces_from_clickhouse(args): settings_str = " ".join( [ From 605c76e66ea5bdd2644026a5c7425e87f24c3702 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 16 Jan 2024 11:22:27 +0100 Subject: [PATCH 089/884] Fix test fails --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 0dc3026afc0..b235918c438 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -6400,23 +6400,27 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, { String database_name = scope.context->getCurrentDatabase(); - String table_name = table_function_node->getOriginalAST()->as()->name; + String table_name = ""; - if (table_function_node->getOriginalAST()->as()->is_compound_name) + if (table_function_node->getOriginalAST() && table_function_node->getOriginalAST()->as()) { - std::vector parts; - splitInto<'.'>(parts, table_function_node->getOriginalAST()->as()->name); - - if (parts.size() == 2) + table_name = table_function_node->getOriginalAST()->as()->name; + if (table_function_node->getOriginalAST()->as()->is_compound_name) { - database_name = parts[0]; - table_name = parts[1]; + std::vector parts; + splitInto<'.'>(parts, table_function_node->getOriginalAST()->as()->name); + + if (parts.size() == 2) + { + database_name = parts[0]; + table_name = parts[1]; + } } } auto & table_function_node_typed = table_function_node->as(); - StoragePtr table = DatabaseCatalog::instance().tryGetTable({database_name, table_name}, scope.context->getQueryContext()); + StoragePtr table = table_name.empty() ? 
nullptr : DatabaseCatalog::instance().tryGetTable({database_name, table_name}, scope.context->getQueryContext()); if (table) { if (table.get()->isView() && table->as() && table->as()->isParameterizedView()) From 8d88f4cf87d13c6760a5235abf4180102daf8b5c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 17 Jan 2024 09:42:53 +0100 Subject: [PATCH 090/884] Update setting is_parameterized_view & settings columns for view --- src/Interpreters/InterpreterCreateQuery.cpp | 5 ++++- src/Storages/StorageView.cpp | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 7985785aa9f..6031c8b4e46 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -780,8 +780,11 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti properties.constraints = as_storage_metadata->getConstraints(); } - else if (create.select && !create.isParameterizedView()) + else if (create.select) { + if (create.isParameterizedView()) + return properties; + Block as_select_sample; if (getContext()->getSettingsRef().allow_experimental_analyzer) diff --git a/src/Storages/StorageView.cpp b/src/Storages/StorageView.cpp index 1898e49de86..6b80e2450c4 100644 --- a/src/Storages/StorageView.cpp +++ b/src/Storages/StorageView.cpp @@ -112,7 +112,7 @@ StorageView::StorageView( : IStorage(table_id_) { StorageInMemoryMetadata storage_metadata; - if (is_parameterized_view_) + if (!is_parameterized_view_) { if (!query.isParameterizedView()) storage_metadata.setColumns(columns_); From d3b4dea8058e1cccb34bf39b3f26b4c0e5b2368a Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 17 Jan 2024 20:02:17 +0100 Subject: [PATCH 091/884] Fix clang tidy build --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 2 +- src/Interpreters/InterpreterCreateQuery.cpp | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index d2270ea9910..7322d53d831 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -6401,7 +6401,7 @@ void QueryAnalyzer::resolveTableFunction(QueryTreeNodePtr & table_function_node, { String database_name = scope.context->getCurrentDatabase(); - String table_name = ""; + String table_name; if (table_function_node->getOriginalAST() && table_function_node->getOriginalAST()->as()) { diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp index 6031c8b4e46..e71946caafe 100644 --- a/src/Interpreters/InterpreterCreateQuery.cpp +++ b/src/Interpreters/InterpreterCreateQuery.cpp @@ -809,11 +809,7 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti * for example: LIMIT, OFFSET, functions parameters, functions constant only arguments. 
*/ - SelectQueryOptions options; - if (create.isParameterizedView()) - options = options.createParameterizedView(); - - InterpreterSelectWithUnionQuery interpreter(create.select->clone(), getContext(), options); + InterpreterSelectWithUnionQuery interpreter(create.select->clone(), getContext(), SelectQueryOptions()); as_select_sample = interpreter.getSampleBlock(); } From e3f5058f6129badab2e0071e86f51ffb77e57ce5 Mon Sep 17 00:00:00 2001 From: MyroTk <44327070+MyroTk@users.noreply.github.com> Date: Wed, 17 Jan 2024 12:13:15 -0800 Subject: [PATCH 092/884] Update Dockerfile --- docker/test/integration/runner/Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index c795fbf0672..2a81db78a3d 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -94,6 +94,7 @@ RUN python3 -m pip install --no-cache-dir \ pytest-repeat \ pytest-timeout \ pytest-xdist \ + pytest-reportlog==0.4.0 \ pytz \ pyyaml==5.3.1 \ redis \ From f89803ebf65d7590e73816052b7ac2de81e04864 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 17 Jan 2024 23:17:53 +0100 Subject: [PATCH 093/884] Slightly better --- docker/test/base/setup_export_logs.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 416281c2aa3..043adf99ffc 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -156,7 +156,8 @@ function setup_logs_replication # Do not try to resolve stack traces in case of debug/sanitizers # build, since it is too slow (flushing of trace_log can take ~1min # with such MV attached) - if [[ "$debug_or_sanitizer_build" = 1 ]]; then + if [[ "$debug_or_sanitizer_build" = 1 ]] + then EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION}" else EXTRA_COLUMNS_EXPRESSION_FOR_TABLE="${EXTRA_COLUMNS_EXPRESSION_TRACE_LOG}" @@ -180,7 +181,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; - s/ORDER BY \(/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"'/; + s/ORDER BY \(?(.+?)\)?/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)'/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d ') From f9ca4e3b8541d7db85effa3f9be286f7ad916965 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 00:52:05 +0100 Subject: [PATCH 094/884] Slightly better --- docker/test/base/setup_export_logs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 043adf99ffc..7033d4b52e2 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -181,7 +181,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; - s/ORDER BY \(?(.+?)\)?/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)'/; + s/ORDER BY \(?(.+?)\)?/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d ') From 61101d1a577b441931ef74b24d449b085d0f0ec3 Mon Sep 17 00:00:00 2001 From: Alexey 
Milovidov Date: Thu, 18 Jan 2024 01:16:50 +0100 Subject: [PATCH 095/884] Add a release build with coverage, turn off coverage in the debug build --- .github/workflows/master.yml | 8 ++++++++ .github/workflows/pull_request.yml | 8 ++++++++ tests/ci/ci_config.py | 17 +++++++++++++---- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index d2865eb737d..50d3eb4a062 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -98,6 +98,14 @@ jobs: build_name: package_release checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + BuilderDebReleaseCoverage: + needs: [ RunConfig, BuildDockers ] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_release_coverage + checkout_depth: 0 + data: ${{ needs.RunConfig.outputs.data }} BuilderDebAarch64: needs: [RunConfig, BuildDockers] if: ${{ !failure() && !cancelled() }} diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index bd2b2b60904..7f843f82c01 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -146,6 +146,14 @@ jobs: build_name: package_release checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} + BuilderDebReleaseCoverage: + needs: [ RunConfig, FastTest ] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_build.yml + with: + build_name: package_release_coverage + checkout_depth: 0 + data: ${{ needs.RunConfig.outputs.data }} BuilderDebAarch64: needs: [RunConfig, FastTest] if: ${{ !failure() && !cancelled() }} diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index b8dff3f0a28..1ca4e06bc8c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -474,6 +474,12 @@ CI_CONFIG = CiConfig( name="package_debug", compiler="clang-17", debug_build=True, + package_type="deb", + sparse_checkout=True, + ), + "package_release_coverage": BuildConfig( + name="package_release_coverage", + compiler="clang-17", coverage=True, package_type="deb", sparse_checkout=True, @@ -571,6 +577,7 @@ CI_CONFIG = CiConfig( "package_tsan", "package_msan", "package_debug", + "package_release_coverage", "binary_release", "fuzzers", ] @@ -660,16 +667,15 @@ CI_CONFIG = CiConfig( "Stateful tests (release)": TestConfig( "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore ), + "Stateful tests (coverage)": TestConfig( + "package_release_coverage", job_config=JobConfig(**stateful_test_common_params) # type: ignore + ), "Stateful tests (aarch64)": TestConfig( "package_aarch64", job_config=JobConfig(**stateful_test_common_params) # type: ignore ), "Stateful tests (release, DatabaseOrdinary)": TestConfig( "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore ), - # "Stateful tests (release, DatabaseReplicated)": TestConfig( - # "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore - # ), - # Stateful tests for parallel replicas "Stateful tests (release, ParallelReplicas)": TestConfig( "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore ), @@ -712,6 +718,9 @@ CI_CONFIG = CiConfig( "Stateless tests (release)": TestConfig( "package_release", job_config=JobConfig(**statless_test_common_params) # type: ignore ), + "Stateless tests (coverage)": TestConfig( + "package_release_coverage", job_config=JobConfig(**statless_test_common_params) # type: ignore + ), "Stateless tests 
(aarch64)": TestConfig( "package_aarch64", job_config=JobConfig(**statless_test_common_params) # type: ignore ), From b9f8fff623448e7013bbe604b39d0f72b81032f9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 01:36:48 +0100 Subject: [PATCH 096/884] Fix YAML --- .github/workflows/master.yml | 2 +- .github/workflows/pull_request.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 50d3eb4a062..1920f3a2a56 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -99,7 +99,7 @@ jobs: checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} BuilderDebReleaseCoverage: - needs: [ RunConfig, BuildDockers ] + needs: [RunConfig, BuildDockers] if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_build.yml with: diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 7f843f82c01..57199e6b9d9 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -147,7 +147,7 @@ jobs: checkout_depth: 0 data: ${{ needs.RunConfig.outputs.data }} BuilderDebReleaseCoverage: - needs: [ RunConfig, FastTest ] + needs: [RunConfig, FastTest] if: ${{ !failure() && !cancelled() }} uses: ./.github/workflows/reusable_build.yml with: From cc5cc361ef561993bc7bbea6f1588562f7d3deae Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 12:35:49 +0100 Subject: [PATCH 097/884] Fix error --- docker/packager/packager | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/packager/packager b/docker/packager/packager index c310185b071..8efd3b8f302 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -289,7 +289,7 @@ def parse_env_variables( result.append("BUILD_TYPE=None") if coverage: - cmake_flags.append("-DSANITIZE_COVERAGE=1") + cmake_flags.append("-DSANITIZE_COVERAGE=1 -DBUILD_STANDALONE_KEEPER=0") if not cache: cmake_flags.append("-DCOMPILER_CACHE=disabled") From 6c0445f36584a60724f7d616f47c7b953621997c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 12:47:18 +0100 Subject: [PATCH 098/884] Fix CMake --- cmake/sanitize.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/sanitize.cmake b/cmake/sanitize.cmake index 3882b51227e..23e9cc34fec 100644 --- a/cmake/sanitize.cmake +++ b/cmake/sanitize.cmake @@ -63,14 +63,14 @@ endif() option(WITH_COVERAGE "Instrumentation for code coverage with default implementation" OFF) if (WITH_COVERAGE) - message (INFORMATION "Enabled instrumentation for code coverage") + message (STATUS "Enabled instrumentation for code coverage") set(COVERAGE_FLAGS "-fprofile-instr-generate -fcoverage-mapping") endif() option (SANITIZE_COVERAGE "Instrumentation for code coverage with custom callbacks" OFF) if (SANITIZE_COVERAGE) - message (INFORMATION "Enabled instrumentation for code coverage") + message (STATUS "Enabled instrumentation for code coverage") # We set this define for whole build to indicate that at least some parts are compiled with coverage. # And to expose it in system.build_options. 
From 6d6b8fcf8e988d78fc983ed4043ed556e36b833b Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Thu, 18 Jan 2024 12:58:50 +0100 Subject: [PATCH 099/884] Add missing comments in code --- tests/ci/ci_config.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 1ca4e06bc8c..45bdfbecb0c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -55,6 +55,13 @@ class JobConfig: run_always: bool = False +# About the "sparse_checkout" option: +# +# Misha f. Shiryaev +# :facepalm: +# we have this feature, it's used by devs, we need to test it in CI +# It's not useful for the CI itself + @dataclass class BuildConfig: name: str From db3ffa5c86dba79ca7052abe8d53799ac3e4afb9 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 18 Jan 2024 12:11:03 +0000 Subject: [PATCH 100/884] Automatic style fix --- tests/ci/ci_config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 45bdfbecb0c..ab37659e65b 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -62,6 +62,7 @@ class JobConfig: # we have this feature, it's used by devs, we need to test it in CI # It's not useful for the CI itself + @dataclass class BuildConfig: name: str From f1749217ee41b3b721fb8a185a929eb18db89b2f Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 18 Jan 2024 21:53:56 +0200 Subject: [PATCH 101/884] added format_schema_rows_template setting --- docs/en/operations/settings/settings-formats.md | 4 ++++ src/Core/Settings.h | 1 + src/Formats/FormatFactory.cpp | 1 + src/Formats/FormatSettings.h | 1 + 4 files changed, 7 insertions(+) diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index eb09af44efd..5dedaa2f6ab 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -1668,6 +1668,10 @@ Path to file which contains format string for rows (for Template format). Delimiter between rows (for Template format). 
+### format_schema_rows_template {#format_schema_rows_template} + +Format string for rows (for Template format) + ## CustomSeparated format settings {custom-separated-format-settings} ### format_custom_escaping_rule {#format_custom_escaping_rule} diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 292e945a29c..4de739ec405 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1079,6 +1079,7 @@ class IColumn; M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ M(String, format_template_row, "", "Path to file which contains format string for rows (for Template format)", 0) \ M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ + M(String, format_schema_rows_template, "\n", "Format string for rows (for Template format)", 0) \ \ M(EscapingRule, format_custom_escaping_rule, "Escaped", "Field escaping rule (for CustomSeparated format)", 0) \ M(String, format_custom_field_delimiter, "\t", "Delimiter between fields (for CustomSeparated format)", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 608f9433d6f..6f414c5a69f 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -166,6 +166,7 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.template_settings.resultset_format = settings.format_template_resultset; format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; format_settings.template_settings.row_format = settings.format_template_row; + format_settings.template_settings.row_format_schema_string = settings.format_schema_rows_template; format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 30e4dd04513..70d33a1edcd 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -338,6 +338,7 @@ struct FormatSettings String resultset_format; String row_format; String row_between_delimiter; + String row_format_schema_string; } template_settings; struct From c966674c242552584540dc2e28026894c39f9b16 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 03:59:05 +0100 Subject: [PATCH 102/884] Disable LTO with Coverage --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 063cfc77302..6e984ddd864 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -348,7 +348,7 @@ if (COMPILER_CLANG) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fdiagnostics-absolute-paths") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdiagnostics-absolute-paths") - if (NOT ENABLE_TESTS AND NOT SANITIZE AND OS_LINUX) + if (NOT ENABLE_TESTS AND NOT SANITIZE AND NOT SANITIZE_COVERAGE AND OS_LINUX) # https://clang.llvm.org/docs/ThinLTO.html # Applies to clang and linux only. # Disabled when building with tests or sanitizers. 
From c6afbe522cae20ee6041534bf7ee7e31e3acb51c Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 05:26:14 +0100 Subject: [PATCH 103/884] Do not check for large translation units with coverage --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e984ddd864..d0f44f6f3ad 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -546,7 +546,7 @@ if (ENABLE_RUST) endif() endif() -if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND NOT SANITIZE AND OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64)) +if (CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" AND NOT SANITIZE AND NOT SANITIZE_COVERAGE AND OS_LINUX AND (ARCH_AMD64 OR ARCH_AARCH64)) set(CHECK_LARGE_OBJECT_SIZES_DEFAULT ON) else () set(CHECK_LARGE_OBJECT_SIZES_DEFAULT OFF) From 482229cd27c7ddf4218af2ea5d9b087e51876ab0 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 08:04:34 +0100 Subject: [PATCH 104/884] Add tests with coverage --- .github/workflows/master.yml | 16 ++++++++++++++++ .github/workflows/pull_request.yml | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/.github/workflows/master.yml b/.github/workflows/master.yml index 1920f3a2a56..5f683fa6c59 100644 --- a/.github/workflows/master.yml +++ b/.github/workflows/master.yml @@ -371,6 +371,14 @@ jobs: test_name: Stateless tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatelessTestCoverage: + needs: [RunConfig, BuilderDebReleaseCoverage] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateless tests (coverage) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} FunctionalStatelessTestReleaseDatabaseOrdinary: needs: [RunConfig, BuilderDebRelease] if: ${{ !failure() && !cancelled() }} @@ -463,6 +471,14 @@ jobs: test_name: Stateful tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatefulTestCoverage: + needs: [RunConfig, BuilderDebReleaseCoverage] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (coverage) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} FunctionalStatefulTestAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml index 28617695ad5..235c8042657 100644 --- a/.github/workflows/pull_request.yml +++ b/.github/workflows/pull_request.yml @@ -382,6 +382,14 @@ jobs: test_name: Stateless tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatelessTestCoverage: + needs: [RunConfig, BuilderDebReleaseCoverage] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateless tests (coverage) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} FunctionalStatelessTestReleaseDatabaseReplicated: needs: [RunConfig, BuilderDebRelease] if: ${{ !failure() && !cancelled() }} @@ -509,6 +517,14 @@ jobs: test_name: Stateful tests (release) runner_type: func-tester data: ${{ needs.RunConfig.outputs.data }} + FunctionalStatefulTestCoverage: + needs: [RunConfig, BuilderDebReleaseCoverage] + if: ${{ !failure() && !cancelled() }} + uses: ./.github/workflows/reusable_test.yml + with: + test_name: Stateful tests (coverage) + runner_type: func-tester + data: ${{ needs.RunConfig.outputs.data }} 
FunctionalStatefulTestAarch64: needs: [RunConfig, BuilderDebAarch64] if: ${{ !failure() && !cancelled() }} From 05609cf75d5048fbd62508fcf6454cec1855943d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 17:02:17 +0100 Subject: [PATCH 105/884] Ci to CI --- tests/ci/ci_config.py | 6 +++--- tests/ci/test_ci_config.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 1d94f4fc1cc..611767be2e4 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -270,9 +270,9 @@ sql_test_params = { @dataclass -class CiConfig: +class CIConfig: """ - Contains configs for ALL jobs in CI pipeline + Contains configs for all jobs in the CI pipeline each config item in the below dicts should be an instance of JobConfig class or inherited from it """ @@ -435,7 +435,7 @@ class CiConfig: raise KeyError("config contains errors", errors) -CI_CONFIG = CiConfig( +CI_CONFIG = CIConfig( label_configs={ Labels.DO_NOT_TEST_LABEL.value: LabelConfig(run_jobs=["Style check"]), }, diff --git a/tests/ci/test_ci_config.py b/tests/ci/test_ci_config.py index d22ed16748e..49d49d9c328 100644 --- a/tests/ci/test_ci_config.py +++ b/tests/ci/test_ci_config.py @@ -3,7 +3,7 @@ import unittest -class TestCiConfig(unittest.TestCase): +class TestCIConfig(unittest.TestCase): def test_no_errors_in_ci_config(self): raised = None try: From 639d7745d450073234405d0725cbd64884d4f8c5 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Fri, 19 Jan 2024 17:02:23 +0100 Subject: [PATCH 106/884] Fix error --- docker/test/base/setup_export_logs.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index 7033d4b52e2..d3721108426 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -181,7 +181,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; - s/ORDER BY \(?(.+?)\)?/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)/; + s/^ORDER BY \(?(.+?)\)?$/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d ') From 8c54a09e6652b491764abeddf3a0e8e6800374ef Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 20 Jan 2024 08:59:29 +0100 Subject: [PATCH 107/884] Fix error --- docker/test/base/setup_export_logs.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/base/setup_export_logs.sh b/docker/test/base/setup_export_logs.sh index d3721108426..156adb1d1e4 100755 --- a/docker/test/base/setup_export_logs.sh +++ b/docker/test/base/setup_export_logs.sh @@ -17,7 +17,7 @@ CLICKHOUSE_CI_LOGS_CLUSTER=${CLICKHOUSE_CI_LOGS_CLUSTER:-system_logs_export} EXTRA_COLUMNS=${EXTRA_COLUMNS:-"pull_request_number UInt32, commit_sha String, check_start_time DateTime('UTC'), check_name LowCardinality(String), instance_type LowCardinality(String), instance_id String, INDEX ix_pr (pull_request_number) TYPE set(100), INDEX ix_commit (commit_sha) TYPE set(100), INDEX ix_check_time (check_start_time) TYPE minmax, "} EXTRA_COLUMNS_EXPRESSION=${EXTRA_COLUMNS_EXPRESSION:-"CAST(0 AS UInt32) AS pull_request_number, '' AS commit_sha, now() AS check_start_time, toLowCardinality('') AS check_name, toLowCardinality('') AS instance_type, '' AS instance_id"} 
-EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name, "} +EXTRA_ORDER_BY_COLUMNS=${EXTRA_ORDER_BY_COLUMNS:-"check_name"} # trace_log needs more columns for symbolization EXTRA_COLUMNS_TRACE_LOG="${EXTRA_COLUMNS} symbols Array(LowCardinality(String)), lines Array(LowCardinality(String)), " @@ -181,7 +181,7 @@ function setup_logs_replication # Create the destination table with adapted name and structure: statement=$(clickhouse-client --format TSVRaw --query "SHOW CREATE TABLE system.${table}" | sed -r -e ' s/^\($/('"$EXTRA_COLUMNS_FOR_TABLE"'/; - s/^ORDER BY \(?(.+?)\)?$/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \1)/; + s/^ORDER BY (([^\(].+?)|\((.+?)\))$/ORDER BY ('"$EXTRA_ORDER_BY_COLUMNS"', \2\3)/; s/^CREATE TABLE system\.\w+_log$/CREATE TABLE IF NOT EXISTS '"$table"'_'"$hash"'/; /^TTL /d ') From cfe60586c007a230df68771b3f914d9a66414b7d Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sat, 20 Jan 2024 21:45:11 +0100 Subject: [PATCH 108/884] Reset coverage after each test --- tests/clickhouse-test | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/clickhouse-test b/tests/clickhouse-test index b62bd5975ea..49c517852a6 100755 --- a/tests/clickhouse-test +++ b/tests/clickhouse-test @@ -2894,7 +2894,7 @@ def parse_args(): parser.add_argument( "--reset-coverage-before-every-test", action="store_true", - default=False, + default=True, help="Collect isolated test coverage for every test instead of a cumulative. Useful only when tests are run sequentially.", ) parser.add_argument( From 51cc01f8be8fea1fcaea0af9c85ca2930536e593 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 21 Jan 2024 14:36:03 +0100 Subject: [PATCH 109/884] Minor change --- base/base/coverage.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index 499e384d21f..05bef21049b 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -62,6 +62,7 @@ namespace uintptr_t * allocate(size_t size) { + /// Note: mmap return zero-initialized memory, and we count on that. void * map = mmap(nullptr, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); if (MAP_FAILED == map) return nullptr; @@ -91,8 +92,6 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop) /// Note: we will leak this. current_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); cumulative_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); - - resetCoverage(); } /// This is called at least once for every DSO for initialization From b967cc6af9deac20eff318e3433fc5b09fd6314a Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 21 Jan 2024 15:30:50 +0100 Subject: [PATCH 110/884] Fix error --- base/base/coverage.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/base/base/coverage.cpp b/base/base/coverage.cpp index 05bef21049b..b85f1a16d32 100644 --- a/base/base/coverage.cpp +++ b/base/base/coverage.cpp @@ -92,6 +92,8 @@ void __sanitizer_cov_trace_pc_guard_init(uint32_t * start, uint32_t * stop) /// Note: we will leak this. 
current_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); cumulative_coverage_array = allocate(sizeof(uintptr_t) * coverage_array_size); + + resetCoverage(); } /// This is called at least once for every DSO for initialization @@ -102,8 +104,8 @@ void __sanitizer_cov_pcs_init(const uintptr_t * pcs_begin, const uintptr_t * pcs return; pc_table_initialized = true; - all_addresses_array = allocate(sizeof(uintptr_t) * coverage_array_size); all_addresses_array_size = pcs_end - pcs_begin; + all_addresses_array = allocate(sizeof(uintptr_t) * all_addresses_array_size); /// They are not a real pointers, but also contain a flag in the most significant bit, /// in which we are not interested for now. Reset it. @@ -125,10 +127,10 @@ void __sanitizer_cov_trace_pc_guard(uint32_t * guard) /// The values of `*guard` are as you set them in /// __sanitizer_cov_trace_pc_guard_init and so you can make them consecutive /// and use them to dereference an array or a bit vector. - void * pc = __builtin_return_address(0); + intptr_t pc = reinterpret_cast(__builtin_return_address(0)); - current_coverage_array[guard - guards_start] = reinterpret_cast(pc); - cumulative_coverage_array[guard - guards_start] = reinterpret_cast(pc); + current_coverage_array[guard - guards_start] = pc; + cumulative_coverage_array[guard - guards_start] = pc; } } From 799b8d6356e68c4544791f42a72d71bed38322c5 Mon Sep 17 00:00:00 2001 From: serxa Date: Sun, 21 Jan 2024 19:00:40 +0000 Subject: [PATCH 111/884] support resource request canceling --- docs/en/operations/system-tables/scheduler.md | 4 + src/Common/Scheduler/ISchedulerNode.h | 2 + src/Common/Scheduler/ISchedulerQueue.h | 6 ++ src/Common/Scheduler/Nodes/FairPolicy.h | 98 ++++++++++--------- src/Common/Scheduler/Nodes/FifoQueue.h | 23 ++++- src/Common/Scheduler/Nodes/PriorityPolicy.h | 37 ++++--- .../gtest_resource_manager_hierarchical.cpp | 1 - .../Nodes/tests/gtest_resource_scheduler.cpp | 63 ++++++++++++ src/Common/Scheduler/ResourceGuard.h | 9 +- src/Common/Scheduler/ResourceRequest.cpp | 13 +++ src/Common/Scheduler/ResourceRequest.h | 30 +++--- src/Common/Scheduler/SchedulerRoot.h | 32 +++--- .../System/StorageSystemScheduler.cpp | 4 + 13 files changed, 218 insertions(+), 104 deletions(-) create mode 100644 src/Common/Scheduler/ResourceRequest.cpp diff --git a/docs/en/operations/system-tables/scheduler.md b/docs/en/operations/system-tables/scheduler.md index 953db4c28f2..c4de7f76fdc 100644 --- a/docs/en/operations/system-tables/scheduler.md +++ b/docs/en/operations/system-tables/scheduler.md @@ -26,7 +26,9 @@ priority: 0 is_active: 0 active_children: 0 dequeued_requests: 67 +canceled_requests: 0 dequeued_cost: 4692272 +canceled_cost: 0 busy_periods: 63 vruntime: 938454.1999999989 system_vruntime: ᴺᵁᴸᴸ @@ -54,7 +56,9 @@ Columns: - `is_active` (`UInt8`) - Whether this node is currently active - has resource requests to be dequeued and constraints satisfied. - `active_children` (`UInt64`) - The number of children in active state. - `dequeued_requests` (`UInt64`) - The total number of resource requests dequeued from this node. +- `canceled_requests` (`UInt64`) - The total number of resource requests canceled from this node. - `dequeued_cost` (`UInt64`) - The sum of costs (e.g. size in bytes) of all requests dequeued from this node. +- `canceled_cost` (`UInt64`) - The sum of costs (e.g. size in bytes) of all requests canceled from this node. - `busy_periods` (`UInt64`) - The total number of deactivations of this node. 
- `vruntime` (`Nullable(Float64)`) - For children of `fair` nodes only. Virtual runtime of a node used by SFQ algorithm to select the next child to process in a max-min fair manner. - `system_vruntime` (`Nullable(Float64)`) - For `fair` nodes only. Virtual runtime showing `vruntime` of the last processed resource request. Used during child activation as the new value of `vruntime`. diff --git a/src/Common/Scheduler/ISchedulerNode.h b/src/Common/Scheduler/ISchedulerNode.h index 804026d7bf4..20c1f4332da 100644 --- a/src/Common/Scheduler/ISchedulerNode.h +++ b/src/Common/Scheduler/ISchedulerNode.h @@ -387,7 +387,9 @@ public: /// Introspection std::atomic dequeued_requests{0}; + std::atomic canceled_requests{0}; std::atomic dequeued_cost{0}; + std::atomic canceled_cost{0}; std::atomic busy_periods{0}; }; diff --git a/src/Common/Scheduler/ISchedulerQueue.h b/src/Common/Scheduler/ISchedulerQueue.h index cbe63bd304a..532f4bf6c63 100644 --- a/src/Common/Scheduler/ISchedulerQueue.h +++ b/src/Common/Scheduler/ISchedulerQueue.h @@ -50,6 +50,12 @@ public: /// Should be called outside of scheduling subsystem, implementation must be thread-safe. virtual void enqueueRequest(ResourceRequest * request) = 0; + /// Cancel previously enqueued request. + /// Returns `false` and does nothing given unknown or already executed request. + /// Returns `true` if requests has been found and canceled. + /// Should be called outside of scheduling subsystem, implementation must be thread-safe. + virtual bool cancelRequest(ResourceRequest * request) = 0; + /// For introspection ResourceCost getBudget() const { diff --git a/src/Common/Scheduler/Nodes/FairPolicy.h b/src/Common/Scheduler/Nodes/FairPolicy.h index c0e187e6fa9..53740e7a543 100644 --- a/src/Common/Scheduler/Nodes/FairPolicy.h +++ b/src/Common/Scheduler/Nodes/FairPolicy.h @@ -134,56 +134,64 @@ public: std::pair dequeueRequest() override { - if (heap_size == 0) - return {nullptr, false}; - - // Recursively pull request from child - auto [request, child_active] = items.front().child->dequeueRequest(); - assert(request != nullptr); - std::pop_heap(items.begin(), items.begin() + heap_size); - Item & current = items[heap_size - 1]; - - // SFQ fairness invariant: system vruntime equals last served request start-time - assert(current.vruntime >= system_vruntime); - system_vruntime = current.vruntime; - - // By definition vruntime is amount of consumed resource (cost) divided by weight - current.vruntime += double(request->cost) / current.child->info.weight; - max_vruntime = std::max(max_vruntime, current.vruntime); - - if (child_active) // Put active child back in heap after vruntime update + while (true) { - std::push_heap(items.begin(), items.begin() + heap_size); - } - else // Deactivate child if it is empty, but remember it's vruntime for latter activations - { - heap_size--; + if (heap_size == 0) + return {nullptr, false}; - // Store index of this inactive child in `parent.idx` - // This enables O(1) search of inactive children instead of O(n) - current.child->info.parent.idx = heap_size; - } + // Recursively pull request from child + auto [request, child_active] = items.front().child->dequeueRequest(); + std::pop_heap(items.begin(), items.begin() + heap_size); + Item & current = items[heap_size - 1]; - // Reset any difference between children on busy period end - if (heap_size == 0) - { - // Reset vtime to zero to avoid floating-point error accumulation, - // but do not reset too often, because it's O(N) - UInt64 ns = clock_gettime_ns(); - if (last_reset_ns + 
1000000000 < ns) + if (request) { - last_reset_ns = ns; - for (Item & item : items) - item.vruntime = 0; - max_vruntime = 0; - } - system_vruntime = max_vruntime; - busy_periods++; - } + // SFQ fairness invariant: system vruntime equals last served request start-time + assert(current.vruntime >= system_vruntime); + system_vruntime = current.vruntime; - dequeued_requests++; - dequeued_cost += request->cost; - return {request, heap_size > 0}; + // By definition vruntime is amount of consumed resource (cost) divided by weight + current.vruntime += double(request->cost) / current.child->info.weight; + max_vruntime = std::max(max_vruntime, current.vruntime); + } + + if (child_active) // Put active child back in heap after vruntime update + { + std::push_heap(items.begin(), items.begin() + heap_size); + } + else // Deactivate child if it is empty, but remember it's vruntime for latter activations + { + heap_size--; + + // Store index of this inactive child in `parent.idx` + // This enables O(1) search of inactive children instead of O(n) + current.child->info.parent.idx = heap_size; + } + + // Reset any difference between children on busy period end + if (heap_size == 0) + { + // Reset vtime to zero to avoid floating-point error accumulation, + // but do not reset too often, because it's O(N) + UInt64 ns = clock_gettime_ns(); + if (last_reset_ns + 1000000000 < ns) + { + last_reset_ns = ns; + for (Item & item : items) + item.vruntime = 0; + max_vruntime = 0; + } + system_vruntime = max_vruntime; + busy_periods++; + } + + if (request) + { + dequeued_requests++; + dequeued_cost += request->cost; + return {request, heap_size > 0}; + } + } } bool isActive() override diff --git a/src/Common/Scheduler/Nodes/FifoQueue.h b/src/Common/Scheduler/Nodes/FifoQueue.h index 38ae902bc2f..2adb7241314 100644 --- a/src/Common/Scheduler/Nodes/FifoQueue.h +++ b/src/Common/Scheduler/Nodes/FifoQueue.h @@ -40,7 +40,6 @@ public: void enqueueRequest(ResourceRequest * request) override { std::unique_lock lock(mutex); - request->enqueue_ns = clock_gettime_ns(); queue_cost += request->cost; bool was_empty = requests.empty(); requests.push_back(request); @@ -63,6 +62,26 @@ public: return {result, !requests.empty()}; } + bool cancelRequest(ResourceRequest * request) override + { + std::unique_lock lock(mutex); + // TODO(serxa): reimplement queue as intrusive list of ResourceRequest to make this O(1) instead of O(N) + for (auto i = requests.begin(), e = requests.end(); i != e; ++i) + { + if (*i == request) + { + requests.erase(i); + if (requests.empty()) + busy_periods++; + queue_cost -= request->cost; + canceled_requests++; + canceled_cost += request->cost; + return true; + } + } + return false; + } + bool isActive() override { std::unique_lock lock(mutex); @@ -105,7 +124,7 @@ public: private: std::mutex mutex; Int64 queue_cost = 0; - std::deque requests; + std::deque requests; // TODO(serxa): reimplement it using intrusive list to avoid allocations/deallocations and O(N) during cancel }; } diff --git a/src/Common/Scheduler/Nodes/PriorityPolicy.h b/src/Common/Scheduler/Nodes/PriorityPolicy.h index 6d6b15bd063..fd02ea3df62 100644 --- a/src/Common/Scheduler/Nodes/PriorityPolicy.h +++ b/src/Common/Scheduler/Nodes/PriorityPolicy.h @@ -102,25 +102,30 @@ public: std::pair dequeueRequest() override { - if (items.empty()) - return {nullptr, false}; - - // Recursively pull request from child - auto [request, child_active] = items.front().child->dequeueRequest(); - assert(request != nullptr); - - // Deactivate child if it is empty - if 
(!child_active) + while (true) { - std::pop_heap(items.begin(), items.end()); - items.pop_back(); if (items.empty()) - busy_periods++; - } + return {nullptr, false}; - dequeued_requests++; - dequeued_cost += request->cost; - return {request, !items.empty()}; + // Recursively pull request from child + auto [request, child_active] = items.front().child->dequeueRequest(); + + // Deactivate child if it is empty + if (!child_active) + { + std::pop_heap(items.begin(), items.end()); + items.pop_back(); + if (items.empty()) + busy_periods++; + } + + if (request) + { + dequeued_requests++; + dequeued_cost += request->cost; + return {request, !items.empty()}; + } + } } bool isActive() override diff --git a/src/Common/Scheduler/Nodes/tests/gtest_resource_manager_hierarchical.cpp b/src/Common/Scheduler/Nodes/tests/gtest_resource_manager_hierarchical.cpp index 961a3b6f713..cdf09776077 100644 --- a/src/Common/Scheduler/Nodes/tests/gtest_resource_manager_hierarchical.cpp +++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_manager_hierarchical.cpp @@ -38,7 +38,6 @@ TEST(SchedulerDynamicResourceManager, Smoke) { ResourceGuard gA(cA->get("res1"), ResourceGuard::PostponeLocking); gA.lock(); - gA.setFailure(); gA.unlock(); ResourceGuard gB(cB->get("res1")); diff --git a/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp b/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp index 9fefbc02cbd..e76639a4b01 100644 --- a/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp +++ b/src/Common/Scheduler/Nodes/tests/gtest_resource_scheduler.cpp @@ -4,6 +4,7 @@ #include +#include #include using namespace DB; @@ -73,6 +74,22 @@ struct ResourceHolder } }; +struct MyRequest : public ResourceRequest +{ + std::function on_execute; + + explicit MyRequest(ResourceCost cost_, std::function on_execute_) + : ResourceRequest(cost_) + , on_execute(on_execute_) + {} + + void execute() override + { + if (on_execute) + on_execute(); + } +}; + TEST(SchedulerRoot, Smoke) { ResourceTest t; @@ -111,3 +128,49 @@ TEST(SchedulerRoot, Smoke) EXPECT_TRUE(fc2->requests.contains(&rg.request)); } } + +TEST(SchedulerRoot, Cancel) +{ + ResourceTest t; + + ResourceHolder r1(t); + auto * fc1 = r1.add("/", "1"); + r1.add("/prio"); + auto a = r1.addQueue("/prio/A", "1"); + auto b = r1.addQueue("/prio/B", "2"); + r1.registerResource(); + + std::barrier sync(2); + std::thread consumer1([&] + { + std::barrier destruct_sync(2); + MyRequest request(1,[&] + { + sync.arrive_and_wait(); // (A) + EXPECT_TRUE(fc1->requests.contains(&request)); + sync.arrive_and_wait(); // (B) + request.finish(); + destruct_sync.arrive_and_wait(); // (C) + }); + a.queue->enqueueRequest(&request); + destruct_sync.arrive_and_wait(); // (C) + }); + + std::thread consumer2([&] + { + MyRequest request(1,[&] + { + FAIL() << "This request must be canceled, but instead executes"; + }); + sync.arrive_and_wait(); // (A) wait for request of consumer1 to be inside execute, so that constraint is in violated state and our request will not be executed immediately + b.queue->enqueueRequest(&request); + bool canceled = b.queue->cancelRequest(&request); + EXPECT_TRUE(canceled); + sync.arrive_and_wait(); // (B) release request of consumer1 to be finished + }); + + consumer1.join(); + consumer2.join(); + + EXPECT_TRUE(fc1->requests.empty()); +} diff --git a/src/Common/Scheduler/ResourceGuard.h b/src/Common/Scheduler/ResourceGuard.h index dca4041b176..50f665a384b 100644 --- a/src/Common/Scheduler/ResourceGuard.h +++ b/src/Common/Scheduler/ResourceGuard.h @@ -71,8 +71,7 
@@ public: // lock(mutex) is not required because `Dequeued` request cannot be used by the scheduler thread chassert(state == Dequeued); state = Finished; - if (constraint) - constraint->finishRequest(this); + ResourceRequest::finish(); } static Request & local() @@ -126,12 +125,6 @@ public: } } - /// Mark request as unsuccessful; by default request is considered to be successful - void setFailure() - { - request.successful = false; - } - ResourceLink link; Request & request; }; diff --git a/src/Common/Scheduler/ResourceRequest.cpp b/src/Common/Scheduler/ResourceRequest.cpp new file mode 100644 index 00000000000..26e8084cdfa --- /dev/null +++ b/src/Common/Scheduler/ResourceRequest.cpp @@ -0,0 +1,13 @@ +#include +#include + +namespace DB +{ + +void ResourceRequest::finish() +{ + if (constraint) + constraint->finishRequest(this); +} + +} diff --git a/src/Common/Scheduler/ResourceRequest.h b/src/Common/Scheduler/ResourceRequest.h index 3d2230746f9..f3153ad382c 100644 --- a/src/Common/Scheduler/ResourceRequest.h +++ b/src/Common/Scheduler/ResourceRequest.h @@ -14,9 +14,6 @@ class ISchedulerConstraint; using ResourceCost = Int64; constexpr ResourceCost ResourceCostMax = std::numeric_limits::max(); -/// Timestamps (nanoseconds since epoch) -using ResourceNs = UInt64; - /* * Request for a resource consumption. The main moving part of the scheduling subsystem. * Resource requests processing workflow: @@ -31,7 +28,7 @@ using ResourceNs = UInt64; * 3) Scheduler calls ISchedulerNode::dequeueRequest() that returns the request. * 4) Callback ResourceRequest::execute() is called to provide access to the resource. * 5) The resource consumption is happening outside of the scheduling subsystem. - * 6) request->constraint->finishRequest() is called when consumption is finished. + * 6) ResourceRequest::finish() is called when consumption is finished. * * Steps (5) and (6) can be omitted if constraint is not used by the resource. * @@ -39,7 +36,10 @@ using ResourceNs = UInt64; * Request ownership is done outside of the scheduling subsystem. * After (6) request can be destructed safely. * - * Request cancelling is not supported yet. + * Request can also be canceled before (3) using ISchedulerQueue::cancelRequest(). + * Returning false means it is too late for request to be canceled. It should be processed in a regular way. + * Returning true means successful cancel and therefore steps (4) and (5) are not going to happen + * and step (6) MUST be omitted. */ class ResourceRequest { @@ -48,32 +48,20 @@ public: /// NOTE: If cost is not known in advance, ResourceBudget should be used (note that every ISchedulerQueue has it) ResourceCost cost; - /// Request outcome - /// Should be filled during resource consumption - bool successful; - /// Scheduler node to be notified on consumption finish /// Auto-filled during request enqueue/dequeue ISchedulerConstraint * constraint; - /// Timestamps for introspection - ResourceNs enqueue_ns; - ResourceNs execute_ns; - ResourceNs finish_ns; - explicit ResourceRequest(ResourceCost cost_ = 1) { reset(cost_); } + /// ResourceRequest object may be reused again after reset() void reset(ResourceCost cost_) { cost = cost_; - successful = true; constraint = nullptr; - enqueue_ns = 0; - execute_ns = 0; - finish_ns = 0; } virtual ~ResourceRequest() = default; @@ -83,6 +71,12 @@ public: /// just triggering start of a consumption, not doing the consumption itself /// (e.g. 
setting an std::promise or creating a job in a thread pool) virtual void execute() = 0; + + /// Stop resource consumption and notify resource scheduler. + /// Should be called when resource consumption is finished by consumer. + /// ResourceRequest should not be destructed or reset before calling to `finish()`. + /// WARNING: this function MUST not be called if request was canceled. + void finish(); }; } diff --git a/src/Common/Scheduler/SchedulerRoot.h b/src/Common/Scheduler/SchedulerRoot.h index 3a23a8df834..ab3f702a422 100644 --- a/src/Common/Scheduler/SchedulerRoot.h +++ b/src/Common/Scheduler/SchedulerRoot.h @@ -145,22 +145,27 @@ public: std::pair dequeueRequest() override { - if (current == nullptr) // No active resources - return {nullptr, false}; + while (true) + { + if (current == nullptr) // No active resources + return {nullptr, false}; - // Dequeue request from current resource - auto [request, resource_active] = current->root->dequeueRequest(); - assert(request != nullptr); + // Dequeue request from current resource + auto [request, resource_active] = current->root->dequeueRequest(); - // Deactivate resource if required - if (!resource_active) - deactivate(current); - else - current = current->next; // Just move round-robin pointer + // Deactivate resource if required + if (!resource_active) + deactivate(current); + else + current = current->next; // Just move round-robin pointer - dequeued_requests++; - dequeued_cost += request->cost; - return {request, current != nullptr}; + if (request == nullptr) // Possible in case of request cancel, just retry + continue; + + dequeued_requests++; + dequeued_cost += request->cost; + return {request, current != nullptr}; + } } bool isActive() override @@ -245,7 +250,6 @@ private: void execute(ResourceRequest * request) { - request->execute_ns = clock_gettime_ns(); request->execute(); } diff --git a/src/Storages/System/StorageSystemScheduler.cpp b/src/Storages/System/StorageSystemScheduler.cpp index ba07d44dbf9..633bac5d285 100644 --- a/src/Storages/System/StorageSystemScheduler.cpp +++ b/src/Storages/System/StorageSystemScheduler.cpp @@ -30,7 +30,9 @@ ColumnsDescription StorageSystemScheduler::getColumnsDescription() {"is_active", std::make_shared(), "Whether this node is currently active - has resource requests to be dequeued and constraints satisfied."}, {"active_children", std::make_shared(), "The number of children in active state."}, {"dequeued_requests", std::make_shared(), "The total number of resource requests dequeued from this node."}, + {"canceled_requests", std::make_shared(), "The total number of resource requests canceled from this node."}, {"dequeued_cost", std::make_shared(), "The sum of costs (e.g. size in bytes) of all requests dequeued from this node."}, + {"canceled_cost", std::make_shared(), "The sum of costs (e.g. size in bytes) of all requests canceled from this node."}, {"busy_periods", std::make_shared(), "The total number of deactivations of this node."}, {"vruntime", std::make_shared(std::make_shared()), "For children of `fair` nodes only. 
Virtual runtime of a node used by SFQ algorithm to select the next child to process in a max-min fair manner."}, @@ -93,7 +95,9 @@ void StorageSystemScheduler::fillData(MutableColumns & res_columns, ContextPtr c res_columns[i++]->insert(node->isActive()); res_columns[i++]->insert(node->activeChildren()); res_columns[i++]->insert(node->dequeued_requests.load()); + res_columns[i++]->insert(node->canceled_requests.load()); res_columns[i++]->insert(node->dequeued_cost.load()); + res_columns[i++]->insert(node->canceled_cost.load()); res_columns[i++]->insert(node->busy_periods.load()); Field vruntime; From eae39ff545978386a8a57bca7c68b1ff97cf6d6d Mon Sep 17 00:00:00 2001 From: Blargian Date: Sun, 21 Jan 2024 21:51:06 +0200 Subject: [PATCH 112/884] #31363 - modified TemplateBlockOutputFormat to work with added format_schema_rows_template setting --- src/Core/Settings.h | 2 +- src/Formats/FormatFactory.cpp | 3 +- src/Formats/FormatSettings.h | 2 +- .../Impl/TemplateBlockOutputFormat.cpp | 33 +++++++++++++++---- 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 4de739ec405..3143ada7d65 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1078,8 +1078,8 @@ class IColumn; M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ M(String, format_template_row, "", "Path to file which contains format string for rows (for Template format)", 0) \ - M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ M(String, format_schema_rows_template, "\n", "Format string for rows (for Template format)", 0) \ + M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ \ M(EscapingRule, format_custom_escaping_rule, "Escaped", "Field escaping rule (for CustomSeparated format)", 0) \ M(String, format_custom_field_delimiter, "\t", "Delimiter between fields (for CustomSeparated format)", 0) \ diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 6f414c5a69f..6f7f758621c 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -166,7 +166,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.template_settings.resultset_format = settings.format_template_resultset; format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; format_settings.template_settings.row_format = settings.format_template_row; - format_settings.template_settings.row_format_schema_string = settings.format_schema_rows_template; + format_settings.template_settings.row_format_schema = settings.format_schema_rows_template; + format_settings.template_settings.row_between_delimiter_schema = settings.format_schema_rows_between_delimiter; format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 70d33a1edcd..28a2076af84 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -338,7 +338,7 @@ struct FormatSettings String resultset_format; String row_format; String row_between_delimiter; - String 
row_format_schema_string; + String row_format_schema; } template_settings; struct diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 6d8fe1e5a2c..495cc0e541e 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -11,6 +11,7 @@ namespace DB namespace ErrorCodes { extern const int SYNTAX_ERROR; + extern const int INVALID_TEMPLATE_FORMAT; } TemplateBlockOutputFormat::TemplateBlockOutputFormat(const Block & header_, WriteBuffer & out_, const FormatSettings & settings_, @@ -213,14 +214,34 @@ void registerOutputFormatTemplate(FormatFactory & factory) }); } - ParsedTemplateFormatString row_format = ParsedTemplateFormatString( + ParsedTemplateFormatString row_format; + auto idx_by_name = [&](const String & colName) + { + return sample.getPositionByName(colName); + }; + if (settings.template_settings.row_format.empty()) + { + if (settings.template_settings.row_format_schema.empty()) + { + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template"); + } + else + { + row_format = ParsedTemplateFormatString(); + row_format.parse(settings.template_settings.row_format_schema,idx_by_name); + } + } + else + { + if (settings.template_settings.row_format_schema.empty()) + { + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template"); + } + row_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & colName) - { - return sample.getPositionByName(colName); - }); - + idx_by_name); + } return std::make_shared(sample, buf, settings, resultset_format, row_format, settings.template_settings.row_between_delimiter); }); From d2c671c17eb4a85583b30d81033f7180ea93f627 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Fri, 19 Jan 2024 20:38:08 +0000 Subject: [PATCH 113/884] 1st attempt at vectorization 80 mio arrays with 150 elements each, runtimes in sec WITH (SELECT vec FROM vectors limit 1) AS const_vec SELECT sum(dist) FROM (SELECT (const_vec, vec) AS dist FROM vectors) auto-vectorized hand-vectorized L2 Float32 0.61 0.57 L2 Float64 1.15 0.99 cos Float32 0.78 0.65 cos Float64 1.35 1.05 --- src/Functions/array/arrayDistance.cpp | 145 +++++++++++++++++- .../02282_array_distance.reference | 4 + .../0_stateless/02282_array_distance.sql | 40 +++-- 3 files changed, 172 insertions(+), 17 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index c68c89ee0d5..670442c0c79 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -9,6 +10,10 @@ #include #include +#if USE_MULTITARGET_CODE +#include +#endif + namespace DB { namespace ErrorCodes @@ -75,6 +80,49 @@ struct L2Distance state.sum += other_state.sum; } +#if USE_MULTITARGET_CODE + template + AVX512_FUNCTION_SPECIFIC_ATTRIBUTE static void accumulateCombine( + const ResultType * __restrict data_x, + const ResultType * __restrict data_y, + size_t i_max, + size_t & i_x, + size_t & i_y, + State & state) + { + __m512 sums; + if constexpr (std::is_same_v) + sums = _mm512_setzero_ps(); + else + sums = _mm512_setzero_pd(); + + const size_t n = (std::is_same_v) ? 
16 : 8; + + for (; i_x + n < i_max; i_x += n, i_y += n) + { + if constexpr (std::is_same_v) + { + __m512 x = _mm512_loadu_ps(data_x + i_x); + __m512 y = _mm512_loadu_ps(data_y + i_y); + __m512 differences = _mm512_sub_ps(x, y); + sums = _mm512_fmadd_ps(differences, differences, sums); + } + else + { + __m512 x = _mm512_loadu_pd(data_x + i_x); + __m512 y = _mm512_loadu_pd(data_y + i_y); + __m512 differences = _mm512_sub_pd(x, y); + sums = _mm512_fmadd_pd(differences, differences, sums); + } + } + + if constexpr (std::is_same_v) + state.sum = _mm512_reduce_add_ps(sums); + else + state.sum = _mm512_reduce_add_pd(sums); + } +#endif + template static ResultType finalize(const State & state, const ConstParams &) { @@ -189,6 +237,70 @@ struct CosineDistance state.y_squared += other_state.y_squared; } +#if USE_MULTITARGET_CODE + template + AVX512_FUNCTION_SPECIFIC_ATTRIBUTE static void accumulateCombine( + const ResultType * __restrict data_x, + const ResultType * __restrict data_y, + size_t i_max, + size_t & i_x, + size_t & i_y, + State & state) + { + __m512 dot_products; + __m512 x_squareds; + __m512 y_squareds; + + if constexpr (std::is_same_v) + { + dot_products = _mm512_setzero_ps(); + x_squareds = _mm512_setzero_ps(); + y_squareds = _mm512_setzero_ps(); + } + else + { + dot_products = _mm512_setzero_pd(); + x_squareds = _mm512_setzero_pd(); + y_squareds = _mm512_setzero_pd(); + } + + const size_t n = (std::is_same_v) ? 16 : 8; + + for (; i_x + n < i_max; i_x += n, i_y += n) + { + if constexpr (std::is_same_v) + { + __m512 x = _mm512_loadu_ps(data_x + i_x); + __m512 y = _mm512_loadu_ps(data_y + i_y); + dot_products = _mm512_fmadd_ps(x, y, dot_products); + x_squareds = _mm512_fmadd_ps(x, x, x_squareds); + y_squareds = _mm512_fmadd_ps(y, y, y_squareds); + } + else + { + __m512 x = _mm512_loadu_pd(data_x + i_x); + __m512 y = _mm512_loadu_pd(data_y + i_y); + dot_products = _mm512_fmadd_pd(x, y, dot_products); + x_squareds = _mm512_fmadd_pd(x, x, x_squareds); + y_squareds = _mm512_fmadd_pd(y, y, y_squareds); + } + } + + if constexpr (std::is_same_v) + { + state.dot_prod = _mm512_reduce_add_ps(dot_products); + state.x_squared = _mm512_reduce_add_ps(x_squareds); + state.y_squared = _mm512_reduce_add_ps(y_squareds); + } + else + { + state.dot_prod = _mm512_reduce_add_pd(dot_products); + state.x_squared = _mm512_reduce_add_pd(x_squareds); + state.y_squared = _mm512_reduce_add_pd(y_squareds); + } + } +#endif + template static ResultType finalize(const State & state, const ConstParams &) { @@ -352,7 +464,7 @@ private: /// Check that arrays in both columns are the sames size for (size_t row = 0; row < offsets_x.size(); ++row) { - if (unlikely(offsets_x[row] != offsets_y[row])) + if (offsets_x[row] != offsets_y[row]) [[unlikely]] { ColumnArray::Offset prev_offset = row > 0 ? offsets_x[row] : 0; throw Exception( @@ -420,7 +532,7 @@ private: ColumnArray::Offset prev_offset = 0; for (size_t row : collections::range(0, offsets_y.size())) { - if (unlikely(offsets_x[0] != offsets_y[row] - prev_offset)) + if (offsets_x[0] != offsets_y[row] - prev_offset) [[unlikely]] { throw Exception( ErrorCodes::SIZES_OF_ARRAYS_DONT_MATCH, @@ -438,14 +550,35 @@ private: auto & result_data = result->getData(); /// Do the actual computation - ColumnArray::Offset prev = 0; + size_t prev = 0; size_t row = 0; + for (auto off : offsets_y) { + size_t i = 0; + typename Kernel::template State state; + + /// SIMD optimization: process multiple elements in both input arrays at once. 
+ /// To avoid combinatorial explosion of SIMD kernels, focus on + /// - the two most common input/output types (Float32 x Float32) --> Float32 and (Float64 x Float64) --> Float64 instead of 10 x + /// 10 input types x 2 output types, + /// - const/non-const inputs instead of non-const/non-const inputs + /// - the two most common metrics L2 and cosine distance, + /// - the most powerful SIMD instruction set (AVX-512F). +#if USE_MULTITARGET_CODE + if constexpr (std::is_same_v && std::is_same_v) /// ResultType is Float32 or Float64 + { + if constexpr (std::is_same_v + || std::is_same_v) + { + if (isArchSupported(TargetArch::AVX512F)) + Kernel::template accumulateCombine(data_x.data(), data_y.data(), i + offsets_x[0], i, prev, state); + } + } +#else /// Process chunks in vectorized manner static constexpr size_t VEC_SIZE = 4; typename Kernel::template State states[VEC_SIZE]; - size_t i = 0; for (; prev + VEC_SIZE < off; i += VEC_SIZE, prev += VEC_SIZE) { for (size_t s = 0; s < VEC_SIZE; ++s) @@ -453,10 +586,9 @@ private: states[s], static_cast(data_x[i + s]), static_cast(data_y[prev + s]), kernel_params); } - typename Kernel::template State state; for (const auto & other_state : states) Kernel::template combine(state, other_state, kernel_params); - +#endif /// Process the tail for (; prev < off; ++i, ++prev) { @@ -466,6 +598,7 @@ private: result_data[row] = Kernel::finalize(state, kernel_params); row++; } + return result; } diff --git a/tests/queries/0_stateless/02282_array_distance.reference b/tests/queries/0_stateless/02282_array_distance.reference index 9758da9a833..c21e294cb62 100644 --- a/tests/queries/0_stateless/02282_array_distance.reference +++ b/tests/queries/0_stateless/02282_array_distance.reference @@ -80,3 +80,7 @@ nan 5 6 268 2 10.234459893824097 23.15167380558045 536 0.00007815428961455151 6 5 268 2 10.234459893824097 23.15167380558045 536 0.00007815428961455151 6 6 0 0 0 0 0 0 +5.8309517 +0.0003244877 +5.830951894845301 +0.0003245172890904424 diff --git a/tests/queries/0_stateless/02282_array_distance.sql b/tests/queries/0_stateless/02282_array_distance.sql index 9c16071dc1f..2cca853fd67 100644 --- a/tests/queries/0_stateless/02282_array_distance.sql +++ b/tests/queries/0_stateless/02282_array_distance.sql @@ -12,10 +12,10 @@ SELECT cosineDistance([1, 2, 3], [0, 0, 0]); -- Overflows WITH CAST([-547274980, 1790553898, 1981517754, 1908431500, 1352428565, -573412550, -552499284, 2096941042], 'Array(Int32)') AS a SELECT - L1Distance(a,a), - L2Distance(a,a), - L2SquaredDistance(a,a), - LinfDistance(a,a), + L1Distance(a, a), + L2Distance(a, a), + L2SquaredDistance(a, a), + LinfDistance(a, a), cosineDistance(a, a); DROP TABLE IF EXISTS vec1; @@ -88,15 +88,33 @@ SELECT FROM vec2f v1, vec2d v2 WHERE length(v1.v) == length(v2.v); -SELECT L1Distance([0, 0], [1]); -- { serverError 190 } -SELECT L2Distance([1, 2], (3,4)); -- { serverError 43 } -SELECT L2SquaredDistance([1, 2], (3,4)); -- { serverError 43 } -SELECT LpDistance([1, 2], [3,4]); -- { serverError 42 } -SELECT LpDistance([1, 2], [3,4], -1.); -- { serverError 69 } -SELECT LpDistance([1, 2], [3,4], 'aaa'); -- { serverError 43 } -SELECT LpDistance([1, 2], [3,4], materialize(2.7)); -- { serverError 44 } +SELECT L1Distance([0, 0], [1]); -- { serverError SIZES_OF_ARRAYS_DONT_MATCH } +SELECT L2Distance([1, 2], (3,4)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT L2SquaredDistance([1, 2], (3,4)); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT LpDistance([1, 2], [3,4]); -- { serverError NUMBER_OF_ARGUMENTS_DOESNT_MATCH } 
+SELECT LpDistance([1, 2], [3,4], -1.); -- { serverError ARGUMENT_OUT_OF_BOUND } +SELECT LpDistance([1, 2], [3,4], 'aaa'); -- { serverError ILLEGAL_TYPE_OF_ARGUMENT } +SELECT LpDistance([1, 2], [3,4], materialize(2.7)); -- { serverError ILLEGAL_COLUMN } DROP TABLE vec1; DROP TABLE vec2; DROP TABLE vec2f; DROP TABLE vec2d; + +-- Queries which trigger manually vectorized implementation + +SELECT L2Distance( + [toFloat32(0.0), toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0)], + materialize([toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0), toFloat32(34.0)])); + +SELECT cosineDistance( + [toFloat32(0.0), toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0)], + materialize([toFloat32(1.0), toFloat32(2.0), toFloat32(3.0), toFloat32(4.0), toFloat32(5.0), toFloat32(6.0), toFloat32(7.0), toFloat32(8.0), toFloat32(9.0), toFloat32(10.0), toFloat32(11.0), toFloat32(12.0), toFloat32(13.0), toFloat32(14.0), toFloat32(15.0), toFloat32(16.0), toFloat32(17.0), toFloat32(18.0), toFloat32(19.0), toFloat32(20.0), toFloat32(21.0), toFloat32(22.0), toFloat32(23.0), toFloat32(24.0), toFloat32(25.0), toFloat32(26.0), toFloat32(27.0), toFloat32(28.0), toFloat32(29.0), toFloat32(30.0), toFloat32(31.0), toFloat32(32.0), toFloat32(33.0), toFloat32(34.0)])); + +SELECT L2Distance( + [toFloat64(0.0), toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0)], + materialize([toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), 
toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0), toFloat64(34.0)])); + +SELECT cosineDistance( + [toFloat64(0.0), toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0)], + materialize([toFloat64(1.0), toFloat64(2.0), toFloat64(3.0), toFloat64(4.0), toFloat64(5.0), toFloat64(6.0), toFloat64(7.0), toFloat64(8.0), toFloat64(9.0), toFloat64(10.0), toFloat64(11.0), toFloat64(12.0), toFloat64(13.0), toFloat64(14.0), toFloat64(15.0), toFloat64(16.0), toFloat64(17.0), toFloat64(18.0), toFloat64(19.0), toFloat64(20.0), toFloat64(21.0), toFloat64(22.0), toFloat64(23.0), toFloat64(24.0), toFloat64(25.0), toFloat64(26.0), toFloat64(27.0), toFloat64(28.0), toFloat64(29.0), toFloat64(30.0), toFloat64(31.0), toFloat64(32.0), toFloat64(33.0), toFloat64(34.0)])); From 68d0f4e42161713f3b54de2069d894b1f84ed833 Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 20 Jan 2024 21:36:25 +0000 Subject: [PATCH 114/884] (Futile) unrolling attempt at vectorization --- src/Functions/array/arrayDistance.cpp | 88 ++++++++++++++++++++++----- 1 file changed, 72 insertions(+), 16 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index 670442c0c79..aa13ee01d9a 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -90,36 +90,92 @@ struct L2Distance size_t & i_y, State & state) { - __m512 sums; - if constexpr (std::is_same_v) - sums = _mm512_setzero_ps(); - else - sums = _mm512_setzero_pd(); + __m512 sums1; + __m512 sums2; + __m512 sums3; + __m512 sums4; - const size_t n = (std::is_same_v) ? 16 : 8; + if constexpr (std::is_same_v) + { + sums1 = _mm512_setzero_ps(); + sums2 = _mm512_setzero_ps(); + sums3 = _mm512_setzero_ps(); + sums4 = _mm512_setzero_ps(); + } + else + { + sums1 = _mm512_setzero_pd(); + sums2 = _mm512_setzero_pd(); + sums3 = _mm512_setzero_pd(); + sums4 = _mm512_setzero_pd(); + } + + const size_t n = (std::is_same_v) ? 
64 : 32; for (; i_x + n < i_max; i_x += n, i_y += n) { if constexpr (std::is_same_v) { - __m512 x = _mm512_loadu_ps(data_x + i_x); - __m512 y = _mm512_loadu_ps(data_y + i_y); - __m512 differences = _mm512_sub_ps(x, y); - sums = _mm512_fmadd_ps(differences, differences, sums); + __m512 x1 = _mm512_loadu_ps(data_x + i_x); + __m512 y1 = _mm512_loadu_ps(data_y + i_y); + __m512 diff1 = _mm512_sub_ps(x1, y1); + sums1 = _mm512_fmadd_ps(diff1, diff1, sums1); + + __m512 x2 = _mm512_loadu_ps(data_x + i_x + 16); + __m512 y2 = _mm512_loadu_ps(data_y + i_y + 16); + __m512 diff2 = _mm512_sub_ps(x2, y2); + sums2 = _mm512_fmadd_ps(diff2, diff2, sums2); + + __m512 x3 = _mm512_loadu_ps(data_x + i_x + 32); + __m512 y3 = _mm512_loadu_ps(data_y + i_y + 32); + __m512 diff3 = _mm512_sub_ps(x3, y3); + sums3 = _mm512_fmadd_ps(diff3, diff3, sums3); + + __m512 x4 = _mm512_loadu_ps(data_x + i_x + 48); + __m512 y4 = _mm512_loadu_ps(data_y + i_y + 48); + __m512 diff4 = _mm512_sub_ps(x4, y4); + sums4 = _mm512_fmadd_ps(diff4, diff4, sums4); } else { - __m512 x = _mm512_loadu_pd(data_x + i_x); - __m512 y = _mm512_loadu_pd(data_y + i_y); - __m512 differences = _mm512_sub_pd(x, y); - sums = _mm512_fmadd_pd(differences, differences, sums); + __m512 x1 = _mm512_loadu_pd(data_x + i_x); + __m512 y1 = _mm512_loadu_pd(data_y + i_y); + __m512 diff1 = _mm512_sub_pd(x1, y1); + sums1 = _mm512_fmadd_pd(diff1, diff1, sums1); + + __m512 x2 = _mm512_loadu_pd(data_x + i_x + 8); + __m512 y2 = _mm512_loadu_pd(data_y + i_y + 8); + __m512 diff2 = _mm512_sub_pd(x2, y2); + sums2 = _mm512_fmadd_pd(diff2, diff2, sums2); + + __m512 x3 = _mm512_loadu_pd(data_x + i_x + 16); + __m512 y3 = _mm512_loadu_pd(data_y + i_y + 16); + __m512 diff3 = _mm512_sub_pd(x3, y3); + sums3 = _mm512_fmadd_pd(diff3, diff3, sums3); + + __m512 x4 = _mm512_loadu_pd(data_x + i_x + 24); + __m512 y4 = _mm512_loadu_pd(data_y + i_y + 24); + __m512 diff4 = _mm512_sub_pd(x4, y4); + sums4 = _mm512_fmadd_pd(diff4, diff4, sums4); } } if constexpr (std::is_same_v) - state.sum = _mm512_reduce_add_ps(sums); + { + Float32 sum1 = _mm512_reduce_add_ps(sums1); + Float32 sum2 = _mm512_reduce_add_ps(sums2); + Float32 sum3 = _mm512_reduce_add_ps(sums3); + Float32 sum4 = _mm512_reduce_add_ps(sums4); + state.sum = sum1 + sum2 + sum3 + sum4; + } else - state.sum = _mm512_reduce_add_pd(sums); + { + Float64 sum1 = _mm512_reduce_add_pd(sums1); + Float64 sum2 = _mm512_reduce_add_pd(sums2); + Float64 sum3 = _mm512_reduce_add_pd(sums3); + Float64 sum4 = _mm512_reduce_add_pd(sums4); + state.sum = sum1 + sum2 + sum3 + sum4; + } } #endif From 68fc97089ec22d29b5d25df4e3865a22cf9701db Mon Sep 17 00:00:00 2001 From: Robert Schulze Date: Sat, 20 Jan 2024 21:50:13 +0000 Subject: [PATCH 115/884] Revert "(Futile) unrolling attempt at vectorization" This reverts commit df30a990545eafdf5e6a09034d81a97fb0188ba0. 
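
For reference, the hand-written AVX-512 kernels introduced and tuned in the three commits above (initial vectorization, the 4x unrolling attempt, and its revert) accumulate the same quantities as the minimal scalar sketch below. This is illustrative only and not taken from any of the patches: the function names are made up, and the Kernel::State / finalize() plumbing from arrayDistance.cpp is omitted.

    #include <cmath>
    #include <cstddef>

    // One-lane analogue of the _mm512_sub/_mm512_fmadd accumulation in L2Distance;
    // the real kernel sums into State and (presumably) takes the square root in finalize().
    template <typename Float>
    Float l2DistanceScalar(const Float * x, const Float * y, size_t n)
    {
        Float sum = 0;
        for (size_t i = 0; i < n; ++i)
        {
            Float d = x[i] - y[i];
            sum += d * d;
        }
        return std::sqrt(sum);
    }

    // Scalar analogue of the three fmadd accumulators in CosineDistance
    // (dot product plus the two squared norms), combined the way finalize() is
    // expected to combine them.
    template <typename Float>
    Float cosineDistanceScalar(const Float * x, const Float * y, size_t n)
    {
        Float dot = 0, x2 = 0, y2 = 0;
        for (size_t i = 0; i < n; ++i)
        {
            dot += x[i] * y[i];
            x2  += x[i] * x[i];
            y2  += y[i] * y[i];
        }
        return 1 - dot / std::sqrt(x2 * y2);
    }

A 512-bit register holds 16 Float32 or 8 Float64 lanes, which is why the kernels advance by n = 16 or n = 8 per iteration; any elements left over once i_x + n >= i_max fall through to the scalar tail loop that follows the #if/#else block. The 34-element arrays in the new 02282_array_distance.sql queries appear to be chosen so that both paths are exercised: for Float32, elements 0-15 and 16-31 go through two SIMD iterations and elements 32-33 through the tail.
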
--- src/Functions/array/arrayDistance.cpp | 84 +++++---------------------- 1 file changed, 14 insertions(+), 70 deletions(-) diff --git a/src/Functions/array/arrayDistance.cpp b/src/Functions/array/arrayDistance.cpp index aa13ee01d9a..670442c0c79 100644 --- a/src/Functions/array/arrayDistance.cpp +++ b/src/Functions/array/arrayDistance.cpp @@ -90,92 +90,36 @@ struct L2Distance size_t & i_y, State & state) { - __m512 sums1; - __m512 sums2; - __m512 sums3; - __m512 sums4; - + __m512 sums; if constexpr (std::is_same_v) - { - sums1 = _mm512_setzero_ps(); - sums2 = _mm512_setzero_ps(); - sums3 = _mm512_setzero_ps(); - sums4 = _mm512_setzero_ps(); - } + sums = _mm512_setzero_ps(); else - { - sums1 = _mm512_setzero_pd(); - sums2 = _mm512_setzero_pd(); - sums3 = _mm512_setzero_pd(); - sums4 = _mm512_setzero_pd(); - } + sums = _mm512_setzero_pd(); - const size_t n = (std::is_same_v) ? 64 : 32; + const size_t n = (std::is_same_v) ? 16 : 8; for (; i_x + n < i_max; i_x += n, i_y += n) { if constexpr (std::is_same_v) { - __m512 x1 = _mm512_loadu_ps(data_x + i_x); - __m512 y1 = _mm512_loadu_ps(data_y + i_y); - __m512 diff1 = _mm512_sub_ps(x1, y1); - sums1 = _mm512_fmadd_ps(diff1, diff1, sums1); - - __m512 x2 = _mm512_loadu_ps(data_x + i_x + 16); - __m512 y2 = _mm512_loadu_ps(data_y + i_y + 16); - __m512 diff2 = _mm512_sub_ps(x2, y2); - sums2 = _mm512_fmadd_ps(diff2, diff2, sums2); - - __m512 x3 = _mm512_loadu_ps(data_x + i_x + 32); - __m512 y3 = _mm512_loadu_ps(data_y + i_y + 32); - __m512 diff3 = _mm512_sub_ps(x3, y3); - sums3 = _mm512_fmadd_ps(diff3, diff3, sums3); - - __m512 x4 = _mm512_loadu_ps(data_x + i_x + 48); - __m512 y4 = _mm512_loadu_ps(data_y + i_y + 48); - __m512 diff4 = _mm512_sub_ps(x4, y4); - sums4 = _mm512_fmadd_ps(diff4, diff4, sums4); + __m512 x = _mm512_loadu_ps(data_x + i_x); + __m512 y = _mm512_loadu_ps(data_y + i_y); + __m512 differences = _mm512_sub_ps(x, y); + sums = _mm512_fmadd_ps(differences, differences, sums); } else { - __m512 x1 = _mm512_loadu_pd(data_x + i_x); - __m512 y1 = _mm512_loadu_pd(data_y + i_y); - __m512 diff1 = _mm512_sub_pd(x1, y1); - sums1 = _mm512_fmadd_pd(diff1, diff1, sums1); - - __m512 x2 = _mm512_loadu_pd(data_x + i_x + 8); - __m512 y2 = _mm512_loadu_pd(data_y + i_y + 8); - __m512 diff2 = _mm512_sub_pd(x2, y2); - sums2 = _mm512_fmadd_pd(diff2, diff2, sums2); - - __m512 x3 = _mm512_loadu_pd(data_x + i_x + 16); - __m512 y3 = _mm512_loadu_pd(data_y + i_y + 16); - __m512 diff3 = _mm512_sub_pd(x3, y3); - sums3 = _mm512_fmadd_pd(diff3, diff3, sums3); - - __m512 x4 = _mm512_loadu_pd(data_x + i_x + 24); - __m512 y4 = _mm512_loadu_pd(data_y + i_y + 24); - __m512 diff4 = _mm512_sub_pd(x4, y4); - sums4 = _mm512_fmadd_pd(diff4, diff4, sums4); + __m512 x = _mm512_loadu_pd(data_x + i_x); + __m512 y = _mm512_loadu_pd(data_y + i_y); + __m512 differences = _mm512_sub_pd(x, y); + sums = _mm512_fmadd_pd(differences, differences, sums); } } if constexpr (std::is_same_v) - { - Float32 sum1 = _mm512_reduce_add_ps(sums1); - Float32 sum2 = _mm512_reduce_add_ps(sums2); - Float32 sum3 = _mm512_reduce_add_ps(sums3); - Float32 sum4 = _mm512_reduce_add_ps(sums4); - state.sum = sum1 + sum2 + sum3 + sum4; - } + state.sum = _mm512_reduce_add_ps(sums); else - { - Float64 sum1 = _mm512_reduce_add_pd(sums1); - Float64 sum2 = _mm512_reduce_add_pd(sums2); - Float64 sum3 = _mm512_reduce_add_pd(sums3); - Float64 sum4 = _mm512_reduce_add_pd(sums4); - state.sum = sum1 + sum2 + sum3 + sum4; - } + state.sum = _mm512_reduce_add_pd(sums); } #endif From df0c018a9be06e9ccbfb40460f29b155aa86b57f Mon 
Sep 17 00:00:00 2001 From: Hongbin Ma Date: Fri, 12 Jan 2024 16:09:09 +0800 Subject: [PATCH 116/884] support T64 for date32 type --- src/Compression/CompressionCodecT64.cpp | 6 +++++ .../00873_t64_codec_date.reference | 4 +++ .../0_stateless/00873_t64_codec_date.sql | 26 +++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 tests/queries/0_stateless/00873_t64_codec_date.reference create mode 100644 tests/queries/0_stateless/00873_t64_codec_date.sql diff --git a/src/Compression/CompressionCodecT64.cpp b/src/Compression/CompressionCodecT64.cpp index bf9a9414bc1..42c6a18aa77 100644 --- a/src/Compression/CompressionCodecT64.cpp +++ b/src/Compression/CompressionCodecT64.cpp @@ -91,6 +91,7 @@ enum class MagicNumber : uint8_t Decimal32 = 19, Decimal64 = 20, IPv4 = 21, + Date32 = 22, }; MagicNumber serializeTypeId(std::optional type_id) @@ -109,6 +110,7 @@ MagicNumber serializeTypeId(std::optional type_id) case TypeIndex::Int32: return MagicNumber::Int32; case TypeIndex::Int64: return MagicNumber::Int64; case TypeIndex::Date: return MagicNumber::Date; + case TypeIndex::Date32: return MagicNumber::Date32; case TypeIndex::DateTime: return MagicNumber::DateTime; case TypeIndex::DateTime64: return MagicNumber::DateTime64; case TypeIndex::Enum8: return MagicNumber::Enum8; @@ -137,6 +139,7 @@ TypeIndex deserializeTypeId(uint8_t serialized_type_id) case MagicNumber::Int32: return TypeIndex::Int32; case MagicNumber::Int64: return TypeIndex::Int64; case MagicNumber::Date: return TypeIndex::Date; + case MagicNumber::Date32: return TypeIndex::Date32; case MagicNumber::DateTime: return TypeIndex::DateTime; case MagicNumber::DateTime64: return TypeIndex::DateTime64; case MagicNumber::Enum8: return TypeIndex::Enum8; @@ -177,6 +180,8 @@ TypeIndex baseType(TypeIndex type_idx) case TypeIndex::Enum16: case TypeIndex::Date: return TypeIndex::UInt16; + case TypeIndex::Date32: + return TypeIndex::Int32; case TypeIndex::UInt32: case TypeIndex::DateTime: case TypeIndex::IPv4: @@ -205,6 +210,7 @@ TypeIndex typeIdx(const IDataType * data_type) case TypeIndex::UInt16: case TypeIndex::Enum16: case TypeIndex::Date: + case TypeIndex::Date32: case TypeIndex::Int32: case TypeIndex::UInt32: case TypeIndex::IPv4: diff --git a/tests/queries/0_stateless/00873_t64_codec_date.reference b/tests/queries/0_stateless/00873_t64_codec_date.reference new file mode 100644 index 00000000000..1568c3122e6 --- /dev/null +++ b/tests/queries/0_stateless/00873_t64_codec_date.reference @@ -0,0 +1,4 @@ +1970-01-01 1970-01-01 1950-01-01 1950-01-01 +1970-01-01 1970-01-01 1970-01-01 1970-01-01 +2149-06-06 2149-06-06 2149-06-08 2149-06-08 +2149-06-06 2149-06-06 2149-06-06 2149-06-06 diff --git a/tests/queries/0_stateless/00873_t64_codec_date.sql b/tests/queries/0_stateless/00873_t64_codec_date.sql new file mode 100644 index 00000000000..e9230c75665 --- /dev/null +++ b/tests/queries/0_stateless/00873_t64_codec_date.sql @@ -0,0 +1,26 @@ +DROP TABLE IF EXISTS t64; + +CREATE TABLE t64 +( + date16 Date, + t_date16 Date Codec(T64, ZSTD), + date_32 Date32, + t_date32 Date32 Codec(T64, ZSTD) +) ENGINE MergeTree() ORDER BY tuple(); + +INSERT INTO t64 values ('1970-01-01', '1970-01-01', '1970-01-01', '1970-01-01'); +INSERT INTO t64 values ('2149-06-06', '2149-06-06', '2149-06-06', '2149-06-06'); +INSERT INTO t64 values ('2149-06-08', '2149-06-08', '2149-06-08', '2149-06-08'); +INSERT INTO t64 values ('1950-01-01', '1950-01-01', '1950-01-01', '1950-01-01'); + +SELECT * FROM t64 ORDER BY date16; + +SELECT * FROM t64 WHERE date16 != t_date16; 
+SELECT * FROM t64 WHERE date_32 != t_date32; + +OPTIMIZE TABLE t64 FINAL; + +SELECT * FROM t64 WHERE date16 != t_date16; +SELECT * FROM t64 WHERE date_32 != t_date32; + +DROP TABLE t64; From 2e7ce5b0e208c91874d44eb0c828a1e01544a387 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 22 Jan 2024 16:24:43 +0100 Subject: [PATCH 117/884] Updated settings ptr and fetching of client from Disk & ObjectStorage --- src/Backups/BackupIO_AzureBlobStorage.cpp | 32 ++++++++----------- src/Backups/BackupIO_AzureBlobStorage.h | 4 +-- .../AzureBlobStorage/AzureObjectStorage.h | 7 +++- .../Cached/CachedObjectStorage.h | 8 +++++ src/Disks/ObjectStorages/IObjectStorage.h | 13 ++++++++ .../copyAzureBlobStorageFile.cpp | 22 ++++++------- .../copyAzureBlobStorageFile.h | 4 +-- src/Storages/StorageAzureBlob.cpp | 2 +- 8 files changed, 57 insertions(+), 35 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 8c6c1040eec..fca324869ae 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -37,13 +37,12 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( , configuration(configuration_) { auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); - settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); - auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupReaderAzureBlobStorage", std::move(client_ptr), - std::move(settings_as_unique_ptr), + StorageAzureBlob::createSettings(context_), configuration_.container); - client = object_storage->getClient(); + client = object_storage->getAzureBlobStorageClient(); + settings = object_storage->getSettings(); } BackupReaderAzureBlobStorage::~BackupReaderAzureBlobStorage() = default; @@ -89,8 +88,8 @@ std::unique_ptr BackupReaderAzureBlobStorage::readFile(const key = file_name; } return std::make_unique( - client.get(), key, read_settings, settings->max_single_read_retries, - settings->max_single_download_retries); + client.get(), key, read_settings, settings.get()->max_single_read_retries, + settings.get()->max_single_download_retries); } void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, size_t file_size, bool encrypted_in_backup, @@ -98,10 +97,8 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, { LOG_INFO(&Poco::Logger::get("BackupReaderAzureBlobStorage"), "Enter copyFileToDisk"); - /// Use the native copy as a more optimal way to copy a file from AzureBlobStorage to AzureBlobStorage if it's possible. - /// We don't check for `has_throttling` here because the native copy almost doesn't use network. 
auto destination_data_source_description = destination_disk->getDataSourceDescription(); - if (destination_data_source_description.sameKind(data_source_description) + if ((destination_data_source_description.type == DataSourceType::AzureBlobStorage) && (destination_data_source_description.is_encrypted == encrypted_in_backup)) { LOG_TRACE(log, "Copying {} from AzureBlobStorage to disk {}", path_in_backup, destination_disk->getName()); @@ -115,7 +112,7 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, copyAzureBlobStorageFile( client, - reinterpret_cast(destination_disk->getObjectStorage().get())->getClient(), + destination_disk->getObjectStorage()->getAzureBlobStorageClient(), configuration.container, fs::path(configuration.blob_path) / path_in_backup, 0, @@ -150,13 +147,12 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( , configuration(configuration_) { auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); - settings = StorageAzureBlob::createSettingsAsSharedPtr(context_); - auto settings_as_unique_ptr = StorageAzureBlob::createSettings(context_); object_storage = std::make_unique("BackupWriterAzureBlobStorage", std::move(client_ptr), - std::move(settings_as_unique_ptr), + StorageAzureBlob::createSettings(context_), configuration_.container); - client = object_storage->getClient(); + client = object_storage->getAzureBlobStorageClient(); + settings = object_storage->getSettings(); } void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backup, DiskPtr src_disk, const String & src_path, @@ -172,7 +168,7 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu { LOG_TRACE(log, "Copying file {} from disk {} to AzureBlobStorag", src_path, src_disk->getName()); copyAzureBlobStorageFile( - reinterpret_cast(src_disk->getObjectStorage().get())->getClient(), + src_disk->getObjectStorage()->getAzureBlobStorageClient(), client, /* src_container */ blob_path[1], /* src_path */ blob_path[0], @@ -267,8 +263,8 @@ std::unique_ptr BackupWriterAzureBlobStorage::readFile(const String } return std::make_unique( - client.get(), key, read_settings, settings->max_single_read_retries, - settings->max_single_download_retries); + client.get(), key, read_settings, settings.get()->max_single_read_retries, + settings.get()->max_single_download_retries); } std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const String & file_name) @@ -285,7 +281,7 @@ std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const Strin return std::make_unique( client.get(), key, - settings->max_single_part_upload_size, + settings.get()->max_single_part_upload_size, DBMS_DEFAULT_BUFFER_SIZE, write_settings); } diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 12bf073cd08..87dc470cdb3 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -31,7 +31,7 @@ private: MultiVersion client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; - std::shared_ptr settings; + MultiVersion settings; }; class BackupWriterAzureBlobStorage : public BackupWriterDefault @@ -60,7 +60,7 @@ private: MultiVersion client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; - std::shared_ptr settings; + MultiVersion settings; }; } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 
52d535054ff..a9d082539e6 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -139,7 +139,12 @@ public: bool isRemote() const override { return true; } - MultiVersion & getClient() { return client; } + MultiVersion & getSettings() { return settings; } + + MultiVersion & getAzureBlobStorageClient() override + { + return client; + } private: const String name; diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 4c185db051d..6b0ff8be58a 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -3,6 +3,7 @@ #include #include #include +#include "config.h" namespace Poco { @@ -118,6 +119,13 @@ public: static bool canUseReadThroughCache(const ReadSettings & settings); +#if USE_AZURE_BLOB_STORAGE + MultiVersion & getAzureBlobStorageClient() override + { + return object_storage->getAzureBlobStorageClient(); + } +#endif + private: FileCache::Key getCacheKey(const std::string & path) const; diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index f405be72287..cf113586ddf 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -23,7 +23,12 @@ #include #include #include +#include "config.h" +#if USE_AZURE_BLOB_STORAGE +#include +#include +#endif namespace DB { @@ -212,6 +217,14 @@ public: virtual WriteSettings patchSettings(const WriteSettings & write_settings) const; +#if USE_AZURE_BLOB_STORAGE + virtual MultiVersion & getAzureBlobStorageClient() + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "This function is only implemented for AzureBlobStorage"); + } +#endif + + private: mutable std::mutex throttlers_mutex; ThrottlerPtr remote_read_throttler; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 272be914cc1..bb8702e9b41 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -49,7 +49,7 @@ namespace size_t total_size_, const String & dest_container_, const String & dest_blob_, - std::shared_ptr settings_, + MultiVersion settings_, const std::optional> & object_metadata_, ThreadPoolCallbackRunner schedule_, bool for_disk_azure_blob_storage_, @@ -65,7 +65,7 @@ namespace , schedule(schedule_) , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) , log(log_) - , max_single_part_upload_size(settings_->max_single_part_upload_size) + , max_single_part_upload_size(settings_.get()->max_single_part_upload_size) { } @@ -78,7 +78,7 @@ namespace size_t total_size; const String & dest_container; const String & dest_blob; - std::shared_ptr settings; + MultiVersion settings; const std::optional> & object_metadata; ThreadPoolCallbackRunner schedule; bool for_disk_azure_blob_storage; @@ -114,9 +114,9 @@ namespace if (!total_size) throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. 
This must not happen"); - auto max_part_number = settings->max_part_number; - auto min_upload_part_size = settings->min_upload_part_size; - auto max_upload_part_size = settings->max_upload_part_size; + auto max_part_number = settings.get()->max_part_number; + auto min_upload_part_size = settings.get()->min_upload_part_size; + auto max_upload_part_size = settings.get()->max_upload_part_size; if (!max_part_number) throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0"); @@ -333,7 +333,7 @@ void copyDataToAzureBlobStorageFile( MultiVersion & dest_client, const String & dest_container, const String & dest_blob, - std::shared_ptr settings, + MultiVersion settings, const std::optional> & object_metadata, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) @@ -352,14 +352,14 @@ void copyAzureBlobStorageFile( size_t size, const String & dest_container, const String & dest_blob, - std::shared_ptr settings, + MultiVersion settings, const ReadSettings & read_settings, const std::optional> & object_metadata, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) { - if (settings->use_native_copy) + if (settings.get()->use_native_copy) { ProfileEvents::increment(ProfileEvents::AzureCopyObject); if (for_disk_azure_blob_storage) @@ -393,8 +393,8 @@ void copyAzureBlobStorageFile( LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container, src_blob); auto create_read_buffer = [&] { - return std::make_unique(src_client.get(), src_blob, read_settings, settings->max_single_read_retries, - settings->max_single_download_retries); + return std::make_unique(src_client.get(), src_blob, read_settings, settings.get()->max_single_read_retries, + settings.get()->max_single_download_retries); }; UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container, dest_blob, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index b022151d32d..491f7cd7176 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -29,7 +29,7 @@ void copyAzureBlobStorageFile( size_t src_size, const String & dest_container, const String & dest_blob, - std::shared_ptr settings, + MultiVersion settings, const ReadSettings & read_settings, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, @@ -48,7 +48,7 @@ void copyDataToAzureBlobStorageFile( MultiVersion & client, const String & dest_container, const String & dest_blob, - std::shared_ptr settings, + MultiVersion settings, const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, bool for_disk_azure_blob_storage = false); diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 7a40d2dcb73..e54838c7a61 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -1214,7 +1214,7 @@ StorageAzureBlobSource::ReaderHolder StorageAzureBlobSource::createReader() QueryPipelineBuilder builder; std::shared_ptr source; std::unique_ptr read_buf; - std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files + std::optional num_rows_from_cache = need_only_count && getContext()->getSettingsRef().use_cache_for_count_from_files ? 
tryGetNumRowsFromCache(path_with_metadata) : std::nullopt; if (num_rows_from_cache) { From 7b235fe643e744b643be6e4d0788de63cae4a07c Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 22 Jan 2024 22:59:59 +0200 Subject: [PATCH 118/884] #31363 - remove schema delimiter setting and add test 00937_format_schema_rows_template.sh and reference --- src/Formats/FormatFactory.cpp | 1 - .../Impl/TemplateBlockOutputFormat.cpp | 15 +++------ ...0937_format_schema_rows_template.reference | 4 +++ .../00937_format_schema_rows_template.sh | 32 +++++++++++++++++++ 4 files changed, 40 insertions(+), 12 deletions(-) create mode 100644 tests/queries/0_stateless/00937_format_schema_rows_template.reference create mode 100755 tests/queries/0_stateless/00937_format_schema_rows_template.sh diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 6f7f758621c..184778a9fa9 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -167,7 +167,6 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; format_settings.template_settings.row_format = settings.format_template_row; format_settings.template_settings.row_format_schema = settings.format_schema_rows_template; - format_settings.template_settings.row_between_delimiter_schema = settings.format_schema_rows_between_delimiter; format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number; diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 495cc0e541e..99a7f59c09e 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -221,21 +221,14 @@ void registerOutputFormatTemplate(FormatFactory & factory) }; if (settings.template_settings.row_format.empty()) { - if (settings.template_settings.row_format_schema.empty()) - { - throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template"); - } - else - { - row_format = ParsedTemplateFormatString(); - row_format.parse(settings.template_settings.row_format_schema,idx_by_name); - } + row_format = ParsedTemplateFormatString(); + row_format.parse(settings.template_settings.row_format_schema,idx_by_name); } else { - if (settings.template_settings.row_format_schema.empty()) + if (!settings.template_settings.row_format_schema.empty()) { - throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template"); + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template, but not both"); } row_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.row_format, "Template", false, diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.reference b/tests/queries/0_stateless/00937_format_schema_rows_template.reference new file mode 100644 index 00000000000..167f16ec55f --- /dev/null +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.reference @@ -0,0 +1,4 @@ +Question: 'How awesome is clickhouse?', Answer: 'unbelievably awesome!', Number of Likes: 456, Date: 2016-01-02; +Question: 
'How fast is clickhouse?', Answer: 'Lightning fast!', Number of Likes: 9876543210, Date: 2016-01-03; +Question: 'Is it opensource', Answer: 'of course it is!', Number of Likes: 789, Date: 2016-01-04 + diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.sh b/tests/queries/0_stateless/00937_format_schema_rows_template.sh new file mode 100755 index 00000000000..651e3618f83 --- /dev/null +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash +# shellcheck disable=SC2016 + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +# Test format_schema_rows_template setting + +$CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS template"; +$CLICKHOUSE_CLIENT --query="CREATE TABLE template (question String, answer String, likes UInt64, date Date) ENGINE = Memory"; +$CLICKHOUSE_CLIENT --query="INSERT INTO template VALUES +('How awesome is clickhouse?', 'unbelievably awesome!', 456, '2016-01-02'),\ +('How fast is clickhouse?', 'Lightning fast!', 9876543210, '2016-01-03'),\ +('Is it opensource', 'of course it is!', 789, '2016-01-04')"; + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_rows_between_delimiter = ';\n'"; + +echo -e "\n" + +# Test that if both format_schema_rows_template setting and format_template_row are provided, error is thrown + +echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Likes: ${likes:Raw}, Date: ${date:Raw}' > "$CURDIR"/00937_template_output_format_row.tmp +$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_template_row = '$CURDIR/00937_template_output_format_row.tmp', \ +format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_rows_between_delimiter = ';\n'"; -- { serverError 474 } + +$CLICKHOUSE_CLIENT --query="DROP TABLE template"; +rm "$CURDIR"/00937_template_output_format_row.tmp \ No newline at end of file From 3832a8261a19004e88a32b4bab39f6b46b14daa6 Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 22 Jan 2024 23:20:02 +0200 Subject: [PATCH 119/884] #31363 - update documentation for En and Ru --- docs/en/interfaces/formats.md | 4 +++- docs/ru/interfaces/formats.md | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a11c3e5ef19..fd44fbf4462 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -253,7 +253,7 @@ This format is also available under the name `TSVRawWithNamesAndNames`. This format allows specifying a custom format string with placeholders for values with a specified escaping rule. -It uses settings `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) +It uses settings `format_template_resultset`, `format_template_row` (`format_schema_rows_template`), `format_template_rows_between_delimiter` and some settings of other formats (e.g. 
`output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) Setting `format_template_row` specifies the path to the file containing format strings for rows with the following syntax: @@ -279,6 +279,8 @@ the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quo `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` +In cases where it is challenging or not possible to deploy format output configuration for the template format to a directory on all nodes in a cluster, or if the format is trivial, then `format_schema_rows_template` can be used to pass the template string directly in the query, rather than a path to the file which contains it. + The `format_template_rows_between_delimiter` setting specifies the delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) Setting `format_template_resultset` specifies the path to the file, which contains a format string for resultset. Format string for resultset has the same syntax as a format string for row and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index b4794b02743..8f8197e2221 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -201,7 +201,7 @@ SELECT * FROM nestedt FORMAT TSV Этот формат позволяет указать произвольную форматную строку, в которую подставляются значения, сериализованные выбранным способом. -Для этого используются настройки `format_template_resultset`, `format_template_row`, `format_template_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) +Для этого используются настройки `format_template_resultset`, `format_template_row` (`format_schema_rows_template`), `format_template_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) Настройка `format_template_row` задаёт путь к файлу, содержащему форматную строку для строк таблицы, которая должна иметь вид: @@ -227,6 +227,8 @@ SELECT * FROM nestedt FORMAT TSV `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` +В тех случаях, когда неудобно или невозможно указать произвольную форматную строку в файле, можно использовать `format_schema_rows_template`, чтобы указать произвольную форматную строку в запросе. + Настройка `format_template_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожидается при вводе) после каждой строки, кроме последней. По умолчанию `\n`. Настройка `format_template_resultset` задаёт путь к файлу, содержащему форматную строку для результата. Форматная строка для результата имеет синтаксис аналогичный форматной строке для строк таблицы и позволяет указать префикс, суффикс и способ вывода дополнительной информации. 
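As a quick illustration of the inline template setting documented in this hunk: the sketch below is not part of the patch; the `events` table and its columns are hypothetical, while the `format_schema_rows_template` / `format_template_rows_between_delimiter` settings and the `${column:EscapingRule}` placeholder syntax are the ones introduced and documented above.

```sql
-- Hypothetical table, used only for illustration.
CREATE TABLE events (message String, likes UInt64) ENGINE = Memory;

-- The row template is passed inline through format_schema_rows_template,
-- so no format_template_row file has to be deployed on the server nodes.
SELECT message, likes
FROM events
ORDER BY likes DESC
FORMAT Template
SETTINGS
    format_schema_rows_template = 'Message: ${message:Quoted}, Likes: ${likes:Raw}',
    format_template_rows_between_delimiter = ';\n';
```

Passing both `format_template_row` and `format_schema_rows_template` in the same query is rejected with an `INVALID_TEMPLATE_FORMAT` error, as enforced by the `TemplateBlockOutputFormat` change earlier in this patch.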
Вместо имён столбцов в ней указываются следующие имена подстановок: From e78eb41264ebb37d3fd813850a3e55ce7690ecea Mon Sep 17 00:00:00 2001 From: MyroTk <44327070+MyroTk@users.noreply.github.com> Date: Mon, 22 Jan 2024 15:19:31 -0800 Subject: [PATCH 120/884] Update Dockerfile --- docker/test/integration/runner/Dockerfile | 57 +++++++++++------------ 1 file changed, 28 insertions(+), 29 deletions(-) diff --git a/docker/test/integration/runner/Dockerfile b/docker/test/integration/runner/Dockerfile index 2a81db78a3d..dbf90f9b810 100644 --- a/docker/test/integration/runner/Dockerfile +++ b/docker/test/integration/runner/Dockerfile @@ -63,47 +63,46 @@ RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | apt-key add - \ RUN python3 -m pip install --no-cache-dir \ - PyMySQL \ - aerospike==11.1.0 \ - asyncio \ + PyMySQL==1.1.0 \ + asyncio==3.4.3 \ avro==1.10.2 \ - azure-storage-blob \ - boto3 \ - cassandra-driver \ + azure-storage-blob==12.19.0 \ + boto3==1.34.24 \ + cassandra-driver==3.29.0 \ confluent-kafka==1.9.2 \ delta-spark==2.3.0 \ - dict2xml \ - dicttoxml \ + dict2xml==1.7.4 \ + dicttoxml==1.7.16 \ docker==6.1.3 \ docker-compose==1.29.2 \ - grpcio \ - grpcio-tools \ - kafka-python \ - kazoo \ - lz4 \ - minio \ - nats-py \ - protobuf \ + grpcio==1.60.0 \ + grpcio-tools==1.60.0 \ + kafka-python==2.0.2 \ + kazoo==2.9.0 \ + lz4==4.3.3 \ + minio==7.2.3 \ + nats-py==2.6.0 \ + protobuf==4.25.2 \ psycopg2-binary==2.9.6 \ - pyhdfs \ + pyhdfs==0.3.1 \ pymongo==3.11.0 \ pyspark==3.3.2 \ - pytest \ + pytest==7.4.4 \ pytest-order==1.0.0 \ - pytest-random \ - pytest-repeat \ - pytest-timeout \ - pytest-xdist \ + pytest-random==0.2 \ + pytest-repeat==0.9.3 \ + pytest-timeout==2.2.0 \ + pytest-xdist==3.5.0 \ pytest-reportlog==0.4.0 \ - pytz \ + pytz==2023.3.post1 \ pyyaml==5.3.1 \ - redis \ - requests-kerberos \ + redis==5.0.1 \ + requests-kerberos==0.14.0 \ tzlocal==2.1 \ - retry \ - bs4 \ - lxml \ - urllib3 + retry==0.9.2 \ + bs4==0.0.2 \ + lxml==5.1.0 \ + urllib3==2.0.7 # bs4, lxml are for cloud tests, do not delete # Hudi supports only spark 3.3.*, not 3.4 From 276ccd3d47be40b79abbaf7734f557d578501b19 Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 23 Jan 2024 07:18:14 +0200 Subject: [PATCH 121/884] empty commit to restart CI checks From 992d859e726895dadc9fbab1ebf99acd4b29881c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Tue, 23 Jan 2024 14:16:14 +0100 Subject: [PATCH 122/884] Fix style check --- src/Disks/ObjectStorages/IObjectStorage.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index cf113586ddf..b7db353fb6a 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -23,6 +23,7 @@ #include #include #include +#include #include "config.h" #if USE_AZURE_BLOB_STORAGE @@ -33,6 +34,11 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + class ReadBufferFromFileBase; class WriteBufferFromFileBase; From 8e0aea301ee4b416d6bb4bcfdf664756ebff55ec Mon Sep 17 00:00:00 2001 From: Dmitry Novik Date: Tue, 23 Jan 2024 14:29:26 +0000 Subject: [PATCH 123/884] Analyzer: Add cast for ConstantNode from constant folding --- src/Analyzer/ConstantNode.cpp | 5 ++++- tests/analyzer_tech_debt.txt | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/Analyzer/ConstantNode.cpp b/src/Analyzer/ConstantNode.cpp index cb05e6ed4e3..69bed3dbe90 100644 --- a/src/Analyzer/ConstantNode.cpp +++ b/src/Analyzer/ConstantNode.cpp @@ 
-128,7 +128,10 @@ ASTPtr ConstantNode::toASTImpl(const ConvertToASTOptions & options) const } } - if (need_to_add_cast_function) + // Add cast if constant was created as a result of constant folding. + // Constant folding may lead to type transformation and literal on shard + // may have a different type. + if (need_to_add_cast_function || source_expression != nullptr) { auto constant_type_name_ast = std::make_shared(constant_value->getType()->getName()); return makeASTFunction("_CAST", std::move(constant_value_ast), std::move(constant_type_name_ast)); diff --git a/tests/analyzer_tech_debt.txt b/tests/analyzer_tech_debt.txt index 4643d109c3d..dd747fff7df 100644 --- a/tests/analyzer_tech_debt.txt +++ b/tests/analyzer_tech_debt.txt @@ -6,7 +6,6 @@ 01155_rename_move_materialized_view 01214_test_storage_merge_aliases_with_where 01244_optimize_distributed_group_by_sharding_key -01268_shard_avgweighted 01495_subqueries_in_with_statement 01560_merge_distributed_join 01584_distributed_buffer_cannot_find_column From 617cc514b74a610ff1f314f911bfb78c779f0b4b Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 22 Jan 2024 22:55:50 +0000 Subject: [PATCH 124/884] Try to detect file format automatically during schema inference if it's unknown --- docs/en/interfaces/schema-inference.md | 48 +- programs/local/LocalServer.cpp | 2 +- programs/obfuscator/Obfuscator.cpp | 2 +- src/Client/ClientBase.cpp | 2 +- src/Common/ErrorCodes.cpp | 1 + src/Databases/DatabaseFilesystem.cpp | 15 +- src/Formats/FormatFactory.cpp | 86 ++-- src/Formats/FormatFactory.h | 28 +- src/Formats/ReadSchemaUtils.cpp | 423 +++++++++++++----- src/Formats/ReadSchemaUtils.h | 76 +++- src/IO/Archives/IArchiveReader.h | 1 + src/IO/Archives/LibArchiveReader.cpp | 9 + src/IO/Archives/LibArchiveReader.h | 1 + src/IO/Archives/ZipArchiveReader.cpp | 9 + src/IO/Archives/ZipArchiveReader.h | 1 + src/Processors/Formats/ISchemaReader.cpp | 2 +- src/Processors/Formats/ISchemaReader.h | 4 +- .../Impl/JSONColumnsBlockInputFormatBase.cpp | 2 +- .../Impl/JSONColumnsBlockInputFormatBase.h | 2 +- .../Formats/Impl/JSONRowInputFormat.cpp | 39 +- .../Formats/Impl/JSONRowInputFormat.h | 5 +- .../Formats/Impl/TemplateRowInputFormat.cpp | 4 +- .../Formats/Impl/ValuesBlockInputFormat.h | 2 +- src/Server/TCPHandler.cpp | 1 - src/Storages/DataLakes/IStorageDataLake.h | 14 +- .../DataLakes/Iceberg/StorageIceberg.cpp | 4 +- .../DataLakes/Iceberg/StorageIceberg.h | 8 +- src/Storages/HDFS/StorageHDFS.cpp | 142 ++++-- src/Storages/HDFS/StorageHDFS.h | 17 +- src/Storages/HDFS/StorageHDFSCluster.cpp | 22 +- src/Storages/HDFS/StorageHDFSCluster.h | 6 +- src/Storages/IStorageCluster.cpp | 7 +- src/Storages/IStorageCluster.h | 7 +- src/Storages/S3Queue/StorageS3Queue.cpp | 8 +- src/Storages/StorageAzureBlob.cpp | 159 +++++-- src/Storages/StorageAzureBlob.h | 36 +- src/Storages/StorageAzureBlobCluster.cpp | 22 +- src/Storages/StorageAzureBlobCluster.h | 5 +- src/Storages/StorageFile.cpp | 421 +++++++++++------ src/Storages/StorageFile.h | 36 +- src/Storages/StorageFileCluster.cpp | 32 +- src/Storages/StorageFileCluster.h | 8 +- src/Storages/StorageS3.cpp | 191 ++++++-- src/Storages/StorageS3.h | 28 +- src/Storages/StorageS3Cluster.cpp | 31 +- src/Storages/StorageS3Cluster.h | 5 +- src/Storages/StorageURL.cpp | 190 ++++++-- src/Storages/StorageURL.h | 36 +- src/Storages/StorageURLCluster.cpp | 40 +- src/Storages/StorageURLCluster.h | 10 +- src/Storages/StorageXDBC.cpp | 4 +- src/Storages/StorageXDBC.h | 4 +- src/TableFunctions/ITableFunctionCluster.h | 5 +- 
src/TableFunctions/ITableFunctionFileLike.cpp | 36 +- src/TableFunctions/ITableFunctionFileLike.h | 4 +- .../TableFunctionAzureBlobStorage.cpp | 124 +++-- .../TableFunctionAzureBlobStorage.h | 2 +- .../TableFunctionAzureBlobStorageCluster.cpp | 6 +- src/TableFunctions/TableFunctionFile.cpp | 9 +- src/TableFunctions/TableFunctionFile.h | 2 +- .../TableFunctionFileCluster.cpp | 3 +- src/TableFunctions/TableFunctionFormat.cpp | 45 +- src/TableFunctions/TableFunctionHDFS.cpp | 2 + .../TableFunctionHDFSCluster.cpp | 3 +- src/TableFunctions/TableFunctionS3.cpp | 90 +++- src/TableFunctions/TableFunctionS3.h | 2 +- src/TableFunctions/TableFunctionS3Cluster.cpp | 6 +- src/TableFunctions/TableFunctionURL.cpp | 39 +- src/TableFunctions/TableFunctionURL.h | 5 +- .../TableFunctionURLCluster.cpp | 3 +- tests/integration/test_file_cluster/test.py | 88 ++++ tests/integration/test_s3_cluster/test.py | 34 +- .../test_storage_azure_blob_storage/test.py | 70 +++ .../test_cluster.py | 69 +++ tests/integration/test_storage_hdfs/test.py | 68 +++ tests/integration/test_storage_s3/test.py | 54 +++ .../02969_auto_format_detection.reference | 123 +++++ .../02969_auto_format_detection.sh | 46 ++ 78 files changed, 2433 insertions(+), 763 deletions(-) create mode 100644 tests/queries/0_stateless/02969_auto_format_detection.reference create mode 100755 tests/queries/0_stateless/02969_auto_format_detection.sh diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index 4db1d53987a..d255688da1f 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -13,7 +13,7 @@ can control it. Schema inference is used when ClickHouse needs to read the data in a specific data format and the structure is unknown. -## Table functions [file](../sql-reference/table-functions/file.md), [s3](../sql-reference/table-functions/s3.md), [url](../sql-reference/table-functions/url.md), [hdfs](../sql-reference/table-functions/hdfs.md). +## Table functions [file](../sql-reference/table-functions/file.md), [s3](../sql-reference/table-functions/s3.md), [url](../sql-reference/table-functions/url.md), [hdfs](../sql-reference/table-functions/hdfs.md), [azureBlobStorage](../sql-reference/table-functions/azureBlobStorage.md). These table functions have the optional argument `structure` with the structure of input data. If this argument is not specified or set to `auto`, the structure will be inferred from the data. @@ -55,7 +55,7 @@ DESCRIBE file('hobbies.jsonl') └─────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` -## Table engines [File](../engines/table-engines/special/file.md), [S3](../engines/table-engines/integrations/s3.md), [URL](../engines/table-engines/special/url.md), [HDFS](../engines/table-engines/integrations/hdfs.md) +## Table engines [File](../engines/table-engines/special/file.md), [S3](../engines/table-engines/integrations/s3.md), [URL](../engines/table-engines/special/url.md), [HDFS](../engines/table-engines/integrations/hdfs.md), [azureBlobStorage](./engines/table-engines/integrations/azureBlobStorage.md) If the list of columns is not specified in `CREATE TABLE` query, the structure of the table will be inferred automatically from the data. @@ -1061,7 +1061,7 @@ $$) └──────────────┴───────────────┘ ``` -## Values {#values} +### Values {#values} In Values format ClickHouse extracts column value from the row and then parses it using the recursive parser similar to how literals are parsed. 
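As a small, hedged illustration of the Values-based inference mentioned in the sentence above (not part of the patch itself; the literal data is invented, and the exact inferred types depend on the usual schema-inference defaults):

```sql
-- Each value is parsed like a literal, so types can be inferred without any schema:
-- roughly Nullable(Int64), Nullable(String), Array(Nullable(Int64)) for the columns below.
DESC format(Values, '(1, ''Hello'', [1, 2, 3]), (2, ''World'', [4, 5])');
```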
@@ -1986,3 +1986,45 @@ Note: - As some of the files may not contain some columns from the resulting schema, union mode is supported only for formats that support reading subset of columns (like JSONEachRow, Parquet, TSVWithNames, etc) and won't work for other formats (like CSV, TSV, JSONCompactEachRow, etc). - If ClickHouse cannot infer the schema from one of the files, the exception will be thrown. - If you have a lot of files, reading schema from all of them can take a lot of time. + + +## Automatic format detection {#automatic-format-detection} + +If the data format is not specified and cannot be determined by the file extension, ClickHouse will try to detect the file format by its content. + +**Examples:** + +Let's say we have a file `data` with the following content: +`data`: +``` +"a","b" +1,"Data1" +2,"Data2" +3,"Data3" +``` + +We can inspect and query this file without specifying format or structure: +```sql +:) desc file(data); +``` + +```text +┌─name─┬─type─────────────┐ +│ a │ Nullable(Int64) │ +│ b │ Nullable(String) │ +└──────┴──────────────────┘ +``` + +```sql +:) select * from file(data); +``` + +```text +┌─a─┬─b─────┐ +│ 1 │ Data1 │ +│ 2 │ Data2 │ +│ 3 │ Data3 │ +└───┴───────┘ +``` + +**Note:** ClickHouse can detect only a subset of formats, and this detection takes some time, so it's always better to specify the format explicitly. \ No newline at end of file diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 4e0b9eeb731..dd96532aadd 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -348,7 +348,7 @@ std::string LocalServer::getInitialCreateTableQuery() /// Use regular file auto file_name = config().getString("table-file"); table_file = quoteString(file_name); - format_from_file_name = FormatFactory::instance().getFormatFromFileName(file_name, false); + format_from_file_name = FormatFactory::instance().getFormatFromFileName(file_name); } auto data_format = backQuoteIfNeed( diff --git a/programs/obfuscator/Obfuscator.cpp b/programs/obfuscator/Obfuscator.cpp index 7e09d5e8046..242e995e466 100644 --- a/programs/obfuscator/Obfuscator.cpp +++ b/programs/obfuscator/Obfuscator.cpp @@ -1310,7 +1310,7 @@ try throw ErrnoException(ErrorCodes::CANNOT_SEEK_THROUGH_FILE, "Input must be seekable file (it will be read twice)"); SingleReadBufferIterator read_buffer_iterator(std::move(file)); - schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, false, context_const); + schema_columns = readSchemaFromFormat(input_format, {}, read_buffer_iterator, context_const); } else { diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index e099aac0de9..01eff0d3e4c 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -1508,7 +1508,7 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des String current_format = parsed_insert_query->format; if (current_format.empty()) - current_format = FormatFactory::instance().getFormatFromFileName(in_file, true); + current_format = FormatFactory::instance().getFormatFromFileName(in_file); /// Create temporary storage file, to support globs and parallel reading /// StorageFile doesn't support ephemeral/materialized/alias columns.
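The `ClientBase` change just above keeps deriving the format of `INSERT ... FROM INFILE` from the file name; a minimal sketch of how that looks from the client side (the `events` table and the file names are hypothetical, not part of the patch):

```sql
-- The CSV format is picked from the .csv extension, no FORMAT clause is needed.
INSERT INTO events FROM INFILE 'rows.csv';

-- If the extension does not identify a format, it can still be given explicitly.
INSERT INTO events FROM INFILE 'rows_dump' FORMAT CSV;
```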
diff --git a/src/Common/ErrorCodes.cpp b/src/Common/ErrorCodes.cpp index 975970bbeeb..01d1d2c679b 100644 --- a/src/Common/ErrorCodes.cpp +++ b/src/Common/ErrorCodes.cpp @@ -592,6 +592,7 @@ M(710, FAULT_INJECTED) \ M(711, FILECACHE_ACCESS_DENIED) \ M(712, TOO_MANY_MATERIALIZED_VIEWS) \ + M(713, CANNOT_DETECT_FORMAT) \ \ M(999, KEEPER_EXCEPTION) \ M(1000, POCO_EXCEPTION) \ diff --git a/src/Databases/DatabaseFilesystem.cpp b/src/Databases/DatabaseFilesystem.cpp index 5564f1d07cf..4105236f0ef 100644 --- a/src/Databases/DatabaseFilesystem.cpp +++ b/src/Databases/DatabaseFilesystem.cpp @@ -146,9 +146,18 @@ StoragePtr DatabaseFilesystem::getTableImpl(const String & name, ContextPtr cont if (!checkTableFilePath(table_path, context_, throw_on_error)) return {}; - auto format = FormatFactory::instance().getFormatFromFileName(table_path, throw_on_error); - if (format.empty()) - return {}; + String format; + if (throw_on_error) + { + format = FormatFactory::instance().getFormatFromFileName(table_path); + } + else + { + auto format_maybe = FormatFactory::instance().tryGetFormatFromFileName(table_path); + if (!format_maybe) + return {}; + format = *format_maybe; + } auto ast_function_ptr = makeASTFunction("file", std::make_shared(table_path), std::make_shared(format)); diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 608f9433d6f..cacb5a510da 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -39,7 +39,7 @@ const FormatFactory::Creators & FormatFactory::getCreators(const String & name) throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); } -FormatSettings getFormatSettings(ContextPtr context) +FormatSettings getFormatSettings(const ContextPtr & context) { const auto & settings = context->getSettingsRef(); @@ -47,7 +47,7 @@ FormatSettings getFormatSettings(ContextPtr context) } template -FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) +FormatSettings getFormatSettings(const ContextPtr & context, const Settings & settings) { FormatSettings format_settings; @@ -253,16 +253,16 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) return format_settings; } -template FormatSettings getFormatSettings(ContextPtr context, const FormatFactorySettings & settings); +template FormatSettings getFormatSettings(const ContextPtr & context, const FormatFactorySettings & settings); -template FormatSettings getFormatSettings(ContextPtr context, const Settings & settings); +template FormatSettings getFormatSettings(const ContextPtr & context, const Settings & settings); InputFormatPtr FormatFactory::getInput( const String & name, ReadBuffer & _buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, UInt64 max_block_size, const std::optional & _format_settings, std::optional _max_parsing_threads, @@ -425,7 +425,7 @@ std::unique_ptr FormatFactory::wrapReadBufferIfNeeded( return res; } -static void addExistingProgressToOutputFormat(OutputFormatPtr format, ContextPtr context) +static void addExistingProgressToOutputFormat(OutputFormatPtr format, const ContextPtr & context) { auto element_id = context->getProcessListElementSafe(); if (element_id) @@ -444,7 +444,7 @@ OutputFormatPtr FormatFactory::getOutputFormatParallelIfPossible( const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_creator; @@ -482,7 
+482,7 @@ OutputFormatPtr FormatFactory::getOutputFormat( const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_creator; @@ -516,7 +516,7 @@ OutputFormatPtr FormatFactory::getOutputFormat( String FormatFactory::getContentType( const String & name, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & output_getter = getCreators(name).output_creator; @@ -535,7 +535,7 @@ String FormatFactory::getContentType( SchemaReaderPtr FormatFactory::getSchemaReader( const String & name, ReadBuffer & buf, - ContextPtr & context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & schema_reader_creator = dict.at(name).schema_reader_creator; @@ -551,7 +551,7 @@ SchemaReaderPtr FormatFactory::getSchemaReader( ExternalSchemaReaderPtr FormatFactory::getExternalSchemaReader( const String & name, - ContextPtr & context, + const ContextPtr & context, const std::optional & _format_settings) const { const auto & external_schema_reader_creator = dict.at(name).external_schema_reader_creator; @@ -605,7 +605,7 @@ void FormatFactory::markFormatHasNoAppendSupport(const String & name) registerAppendSupportChecker(name, [](const FormatSettings &){ return false; }); } -bool FormatFactory::checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional & format_settings_) +bool FormatFactory::checkIfFormatSupportAppend(const String & name, const ContextPtr & context, const std::optional & format_settings_) { auto format_settings = format_settings_ ? *format_settings_ : getFormatSettings(context); auto & append_support_checker = dict[name].append_support_checker; @@ -628,10 +628,10 @@ void FormatFactory::registerFileExtension(const String & extension, const String file_extension_formats[boost::to_lower_copy(extension)] = format_name; } -String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_found) +std::optional FormatFactory::tryGetFormatFromFileName(String file_name) { if (file_name == "stdin") - return getFormatFromFileDescriptor(STDIN_FILENO); + return tryGetFormatFromFileDescriptor(STDIN_FILENO); CompressionMethod compression_method = chooseCompressionMethod(file_name, ""); if (CompressionMethod::None != compression_method) @@ -643,43 +643,53 @@ String FormatFactory::getFormatFromFileName(String file_name, bool throw_if_not_ auto pos = file_name.find_last_of('.'); if (pos == String::npos) - { - if (throw_if_not_found) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the file format by it's extension"); - return ""; - } + return std::nullopt; String file_extension = file_name.substr(pos + 1, String::npos); boost::algorithm::to_lower(file_extension); auto it = file_extension_formats.find(file_extension); if (it == file_extension_formats.end()) - { - if (throw_if_not_found) - throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the file format by it's extension"); - return ""; - } + return std::nullopt; + return it->second; } -String FormatFactory::getFormatFromFileDescriptor(int fd) +String FormatFactory::getFormatFromFileName(String file_name) +{ + if (auto format = tryGetFormatFromFileName(file_name)) + return *format; + + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the format of the file {} by it's extension", file_name); +} + +std::optional 
FormatFactory::tryGetFormatFromFileDescriptor(int fd) { #ifdef OS_LINUX std::string proc_path = fmt::format("/proc/self/fd/{}", fd); char file_path[PATH_MAX] = {'\0'}; if (readlink(proc_path.c_str(), file_path, sizeof(file_path) - 1) != -1) - return getFormatFromFileName(file_path, false); - return ""; + return tryGetFormatFromFileName(file_path); + return std::nullopt; #elif defined(OS_DARWIN) char file_path[PATH_MAX] = {'\0'}; if (fcntl(fd, F_GETPATH, file_path) != -1) - return getFormatFromFileName(file_path, false); - return ""; + return tryGetFormatFromFileName(file_path); + return std::nullopt; #else (void)fd; - return ""; + return std::nullopt; #endif } +String FormatFactory::getFormatFromFileDescriptor(int fd) +{ + if (auto format = tryGetFormatFromFileDescriptor(fd)) + return *format; + + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Cannot determine the format of the data by the file descriptor {}", fd); +} + + void FormatFactory::registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine) { auto & target = dict[name].file_segmentation_engine_creator; @@ -765,7 +775,7 @@ void FormatFactory::registerAdditionalInfoForSchemaCacheGetter( target = std::move(additional_info_for_schema_cache_getter); } -String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional & format_settings_) +String FormatFactory::getAdditionalInfoForSchemaCache(const String & name, const ContextPtr & context, const std::optional & format_settings_) { const auto & additional_info_getter = getCreators(name).additional_info_for_schema_cache_getter; if (!additional_info_getter) @@ -810,7 +820,7 @@ bool FormatFactory::checkIfOutputFormatPrefersLargeBlocks(const String & name) c return target.prefers_large_blocks; } -bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const +bool FormatFactory::checkParallelizeOutputAfterReading(const String & name, const ContextPtr & context) const { if (name == "Parquet" && context->getSettingsRef().input_format_parquet_preserve_order) return false; @@ -825,6 +835,18 @@ void FormatFactory::checkFormatName(const String & name) const throw Exception(ErrorCodes::UNKNOWN_FORMAT, "Unknown format {}", name); } +std::vector FormatFactory::getAllInputFormats() const +{ + std::vector input_formats; + for (const auto & [format_name, creators] : dict) + { + if (creators.input_creator || creators.random_access_input_creator) + input_formats.push_back(format_name); + } + + return input_formats; +} + FormatFactory & FormatFactory::instance() { static FormatFactory ret; diff --git a/src/Formats/FormatFactory.h b/src/Formats/FormatFactory.h index 9670c690456..165a20f7c4d 100644 --- a/src/Formats/FormatFactory.h +++ b/src/Formats/FormatFactory.h @@ -48,10 +48,10 @@ using RowOutputFormatPtr = std::shared_ptr; template struct Memory; -FormatSettings getFormatSettings(ContextPtr context); +FormatSettings getFormatSettings(const ContextPtr & context); template -FormatSettings getFormatSettings(ContextPtr context, const T & settings); +FormatSettings getFormatSettings(const ContextPtr & context, const T & settings); /** Allows to create an IInputFormat or IOutputFormat by the name of the format. * Note: format and compression are independent things. 
@@ -161,7 +161,7 @@ public: const String & name, ReadBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, UInt64 max_block_size, const std::optional & format_settings = std::nullopt, std::optional max_parsing_threads = std::nullopt, @@ -178,30 +178,30 @@ public: const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; OutputFormatPtr getOutputFormat( const String & name, WriteBuffer & buf, const Block & sample, - ContextPtr context, + const ContextPtr & context, const std::optional & _format_settings = std::nullopt) const; String getContentType( const String & name, - ContextPtr context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; SchemaReaderPtr getSchemaReader( const String & name, ReadBuffer & buf, - ContextPtr & context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; ExternalSchemaReaderPtr getExternalSchemaReader( const String & name, - ContextPtr & context, + const ContextPtr & context, const std::optional & format_settings = std::nullopt) const; void registerFileSegmentationEngine(const String & name, FileSegmentationEngine file_segmentation_engine); @@ -216,7 +216,7 @@ public: /// registerAppendSupportChecker with append_support_checker that always returns true. void markFormatHasNoAppendSupport(const String & name); - bool checkIfFormatSupportAppend(const String & name, ContextPtr context, const std::optional & format_settings_ = std::nullopt); + bool checkIfFormatSupportAppend(const String & name, const ContextPtr & context, const std::optional & format_settings_ = std::nullopt); /// Register format by its name. void registerInputFormat(const String & name, InputCreator input_creator); @@ -225,8 +225,10 @@ public: /// Register file extension for format void registerFileExtension(const String & extension, const String & format_name); - String getFormatFromFileName(String file_name, bool throw_if_not_found = false); + String getFormatFromFileName(String file_name); + std::optional tryGetFormatFromFileName(String file_name); String getFormatFromFileDescriptor(int fd); + std::optional tryGetFormatFromFileDescriptor(int fd); /// Register schema readers for format its name. 
void registerSchemaReader(const String & name, SchemaReaderCreator schema_reader_creator); @@ -244,16 +246,18 @@ public: bool checkIfFormatHasAnySchemaReader(const String & name) const; bool checkIfOutputFormatPrefersLargeBlocks(const String & name) const; - bool checkParallelizeOutputAfterReading(const String & name, ContextPtr context) const; + bool checkParallelizeOutputAfterReading(const String & name, const ContextPtr & context) const; void registerAdditionalInfoForSchemaCacheGetter(const String & name, AdditionalInfoForSchemaCacheGetter additional_info_for_schema_cache_getter); - String getAdditionalInfoForSchemaCache(const String & name, ContextPtr context, const std::optional & format_settings_ = std::nullopt); + String getAdditionalInfoForSchemaCache(const String & name, const ContextPtr & context, const std::optional & format_settings_ = std::nullopt); const FormatsDictionary & getAllFormats() const { return dict; } + std::vector getAllInputFormats() const; + bool isInputFormat(const String & name) const; bool isOutputFormat(const String & name) const; diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 43931be3449..b4fba7b9ce6 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -4,6 +4,7 @@ #include #include #include +#include namespace DB { @@ -14,7 +15,9 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int ONLY_NULLS_WHILE_READING_SCHEMA; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int TYPE_MISMATCH; + extern const int LOGICAL_ERROR; } static std::optional getOrderedColumnsList(const NamesAndTypesList & columns_list, const Names & columns_order_hint) @@ -43,48 +46,86 @@ bool isRetryableSchemaInferenceError(int code) return code == ErrorCodes::EMPTY_DATA_PASSED || code == ErrorCodes::ONLY_NULLS_WHILE_READING_SCHEMA; } -ColumnsDescription readSchemaFromFormat( - const String & format_name, +/// Order of formats to try in automatic format detection. +/// If we can successfully detect some format, we won't try next ones. +static const std::vector & getFormatsOrderForDetection() +{ + static const std::vector formats_order = + { + "Parquet", + "ORC", + "Arrow", + "ArrowStream", + "Avro", + "AvroConfluent", + "Npy", + "Native", + "BSONEachRow", + "JSONCompact", + "Values", + "TSKV", + "JSONObjectEachRow", + "JSONColumns", + "JSONCompactColumns", + "JSONCompact", + "JSON", + }; + + return formats_order; +} + +/// The set of similar formats to try in automatic format detection. +/// We will try all formats from this set and then choose the best one +/// according to inferred schema. 
+static const std::vector & getSimilarFormatsSetForDetection() +{ + static const std::vector formats_order = + { + "TSV", + "CSV", + }; + + return formats_order; +} + +std::pair readSchemaFromFormatImpl( + std::optional format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context, - std::unique_ptr & buf) + const ContextPtr & context) try { NamesAndTypesList names_and_types; SchemaInferenceMode mode = context->getSettingsRef().schema_inference_mode; - if (mode == SchemaInferenceMode::UNION && !FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format_name, context, format_settings)) + if (format_name && mode == SchemaInferenceMode::UNION && !FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(*format_name, context, format_settings)) { String additional_message; /// Better exception message for WithNames(AndTypes) formats. - if (format_name.ends_with("WithNames") || format_name.ends_with("WithNamesAndTypes")) + if (format_name->ends_with("WithNames") || format_name->ends_with("WithNamesAndTypes")) additional_message = " (formats -WithNames(AndTypes) support reading subset of columns only when setting input_format_with_names_use_header is enabled)"; - throw Exception(ErrorCodes::BAD_ARGUMENTS, "UNION schema inference mode is not supported for format {}, because it doesn't support reading subset of columns{}", format_name, additional_message); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "UNION schema inference mode is not supported for format {}, because it doesn't support reading subset of columns{}", *format_name, additional_message); } - if (FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format_name)) + if (format_name && FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format_name)) { - auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(format_name, context, format_settings); + auto external_schema_reader = FormatFactory::instance().getExternalSchemaReader(*format_name, context, format_settings); try { - names_and_types = external_schema_reader->readSchema(); + return {ColumnsDescription(external_schema_reader->readSchema()), *format_name}; } catch (Exception & e) { e.addMessage( - fmt::format("Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); + fmt::format("The table structure cannot be extracted from a {} format file. You can specify the structure manually", *format_name)); throw; } } - else if (FormatFactory::instance().checkIfFormatHasSchemaReader(format_name)) - { - if (mode == SchemaInferenceMode::UNION) - retry = false; + if (!format_name || FormatFactory::instance().checkIfFormatHasSchemaReader(*format_name)) + { + IReadBufferIterator::Data iterator_data; std::vector> schemas_for_union_mode; - std::optional cached_columns; std::string exception_messages; SchemaReaderPtr schema_reader; size_t max_rows_to_read = format_settings ? format_settings->max_rows_to_read_for_schema_inference @@ -94,45 +135,71 @@ try size_t iterations = 0; while (true) { + /// When we finish working with current buffer we should put it back to iterator. 
+ SCOPE_EXIT(if (iterator_data.buf) read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf))); bool is_eof = false; try { - read_buffer_iterator.setPreviousReadBuffer(std::move(buf)); - std::tie(buf, cached_columns) = read_buffer_iterator.next(); - if (cached_columns) + iterator_data = read_buffer_iterator.next(); + + /// Read buffer iterator can determine the data format if it's unknown. + /// For example by scanning schema cache or by finding new file with format extension. + if (!format_name && iterator_data.format_name) { + format_name = *iterator_data.format_name; + read_buffer_iterator.setFormatName(*iterator_data.format_name); + } + + if (iterator_data.cached_columns) + { + /// If we have schema in cache, we must also know the format. + if (!format_name) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Schema from cache was returned, but format name is unknown"); + if (mode == SchemaInferenceMode::DEFAULT) - return *cached_columns; - schemas_for_union_mode.emplace_back(cached_columns->getAll(), read_buffer_iterator.getLastFileName()); + { + read_buffer_iterator.setResultingSchema(*iterator_data.cached_columns); + return {*iterator_data.cached_columns, *format_name}; + } + + schemas_for_union_mode.emplace_back(iterator_data.cached_columns->getAll(), read_buffer_iterator.getLastFileName()); continue; } - if (!buf) + if (!iterator_data.buf) break; /// We just want to check for eof, but eof() can be pretty expensive. /// So we use getFileSize() when available, which has better worst case. /// (For remote files, typically eof() would read 1 MB from S3, which may be much /// more than what the schema reader and even data reader will read). - auto size = tryGetFileSizeFromReadBuffer(*buf); + auto size = tryGetFileSizeFromReadBuffer(*iterator_data.buf); if (size.has_value()) is_eof = *size == 0; else - is_eof = buf->eof(); + is_eof = iterator_data.buf->eof(); } catch (Exception & e) { - e.addMessage( - fmt::format("Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); + if (format_name) + e.addMessage(fmt::format("The table structure cannot be extracted from a {} format file. You can specify the structure manually", *format_name)); + else + e.addMessage("The data format cannot be detected by the contents of the files. You can specify the format manually"); throw; } catch (...) 
{ auto exception_message = getCurrentExceptionMessage(false); + if (format_name) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file:\n{}\nYou can specify the structure manually", + *format_name, + exception_message); + throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file:\n{}\nYou can specify the structure manually", - format_name, + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files:\n{}\nYou can specify the format manually", exception_message); } @@ -140,91 +207,218 @@ try if (is_eof) { - auto exception_message = fmt::format("Cannot extract table structure from {} format file, file is empty", format_name); + String exception_message; + if (format_name) + exception_message = fmt::format("The table structure cannot be extracted from a {} format file: the file is empty", *format_name); + else + exception_message = fmt::format("The data format cannot be detected by the contents of the files: the file is empty"); - if (!retry) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "{}. You can specify the structure manually", exception_message); + if (mode == SchemaInferenceMode::UNION) + { + if (!format_name) + throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files: the file is empty. You can specify the format manually"); - exception_messages += "\n" + exception_message; + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "{}. You can specify the structure manually", exception_message); + } + + if (!exception_messages.empty()) + exception_messages += "\n"; + exception_messages += exception_message; continue; } - try + if (format_name) { - schema_reader = FormatFactory::instance().getSchemaReader(format_name, *buf, context, format_settings); - schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); - names_and_types = schema_reader->readSchema(); - auto num_rows = schema_reader->readNumberOrRows(); - if (num_rows) - read_buffer_iterator.setNumRowsToLastFile(*num_rows); - - /// In default mode, we finish when schema is inferred successfully from any file. - if (mode == SchemaInferenceMode::DEFAULT) - break; - - if (!names_and_types.empty()) - read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); - schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); - } - catch (...) - { - auto exception_message = getCurrentExceptionMessage(false); - if (schema_reader && mode == SchemaInferenceMode::DEFAULT) + try { - size_t rows_read = schema_reader->getNumRowsRead(); - assert(rows_read <= max_rows_to_read); - max_rows_to_read -= schema_reader->getNumRowsRead(); - size_t bytes_read = buf->count(); - /// We could exceed max_bytes_to_read a bit to complete row parsing. 
- max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read); - if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0)) - { - exception_message += "\nTo increase the maximum number of rows/bytes to read for structure determination, use setting " - "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference"; + schema_reader = FormatFactory::instance().getSchemaReader(*format_name, *iterator_data.buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + names_and_types = schema_reader->readSchema(); + auto num_rows = schema_reader->readNumberOrRows(); + if (num_rows) + read_buffer_iterator.setNumRowsToLastFile(*num_rows); - if (iterations > 1) + /// In default mode, we finish when schema is inferred successfully from any file. + if (mode == SchemaInferenceMode::DEFAULT) + break; + + if (!names_and_types.empty()) + read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); + schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); + } + catch (...) + { + auto exception_message = getCurrentExceptionMessage(false); + if (schema_reader && mode == SchemaInferenceMode::DEFAULT) + { + size_t rows_read = schema_reader->getNumRowsRead(); + assert(rows_read <= max_rows_to_read); + max_rows_to_read -= schema_reader->getNumRowsRead(); + size_t bytes_read = iterator_data.buf->count(); + /// We could exceed max_bytes_to_read a bit to complete row parsing. + max_bytes_to_read -= std::min(bytes_read, max_bytes_to_read); + if (rows_read != 0 && (max_rows_to_read == 0 || max_bytes_to_read == 0)) { - exception_messages += "\n" + exception_message; + exception_message + += "\nTo increase the maximum number of rows/bytes to read for structure determination, use setting " + "input_format_max_rows_to_read_for_schema_inference/input_format_max_bytes_to_read_for_schema_inference"; + if (!exception_messages.empty()) + exception_messages += "\n"; + exception_messages += exception_message; break; } - retry = false; } - } - if (!retry || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) - { - try - { - throw; - } - catch (Exception & e) - { - e.addMessage(fmt::format( - "Cannot extract table structure from {} format file. You can specify the structure manually", format_name)); - throw; - } - catch (...) + if (mode == SchemaInferenceMode::UNION || !isRetryableSchemaInferenceError(getCurrentExceptionCode())) { throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file. " + "The table structure cannot be extracted from a {} format file. " "Error: {}. You can specify the structure manually", - format_name, + *format_name, exception_message); } + + if (!exception_messages.empty()) + exception_messages += "\n"; + exception_messages += exception_message; + } + } + else + { + /// If the format is unknown we try some formats in order and try to apply their schema readers. + /// If we can successfully infer the schema in some format, most likely we can use this format to read this data. + + /// If read_buffer_iterator supports recreation of last buffer, we will recreate it for + /// each format. Otherwise we will use PeekableReadBuffer and will rollback to the + /// beginning of the file before each format. 
Using PeekableReadBuffer can lead + /// to high memory usage as it will save all the read data from the beginning of the file, + /// especially it will be noticeable for formats like Parquet/ORC/Arrow that do seeks to the + /// end of file. + std::unique_ptr peekable_buf; + bool support_buf_recreation = read_buffer_iterator.supportsLastReadBufferRecreation(); + if (!support_buf_recreation) + { + peekable_buf = std::make_unique(*iterator_data.buf); + peekable_buf->setCheckpoint(); + } + + /// First, try some formats in order. If we successfully inferred the schema for any format, + /// we will use this format. + for (const auto & format_to_detect : getFormatsOrderForDetection()) + { + try + { + schema_reader = FormatFactory::instance().getSchemaReader(format_to_detect, support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + names_and_types = schema_reader->readSchema(); + if (names_and_types.empty()) + continue; + + /// We successfully inferred schema from this file using current format. + format_name = format_to_detect; + read_buffer_iterator.setFormatName(format_to_detect); + + auto num_rows = schema_reader->readNumberOrRows(); + if (num_rows) + read_buffer_iterator.setNumRowsToLastFile(*num_rows); + + break; + } + catch (...) + { + /// We failed to infer the schema for this format. + /// Recreate read buffer or rollback to the beginning of the data + /// before trying next format. + if (support_buf_recreation) + { + read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf)); + iterator_data.buf = read_buffer_iterator.recreateLastReadBuffer(); + } + else + { + peekable_buf->rollbackToCheckpoint(); + } + } } - exception_messages += "\n" + exception_message; + /// If no format was detected from first set of formats, we try second set. + /// In this set formats are similar and it can happen that data matches some of them. + /// We try to infer schema for all of the formats from this set and then choose the best + /// one according to the inferred schema. + if (!format_name) + { + std::unordered_map format_to_schema; + for (const auto & format_to_detect : getSimilarFormatsSetForDetection()) + { + try + { + schema_reader = FormatFactory::instance().getSchemaReader( + format_to_detect, support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); + schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); + auto tmp_names_and_types = schema_reader->readSchema(); + /// If schema was inferred successfully for this format, remember it and try next format. + if (!tmp_names_and_types.empty()) + format_to_schema[format_to_detect] = tmp_names_and_types; + } + catch (...) // NOLINT(bugprone-empty-catch) + { + /// Try next format. + } + + if (support_buf_recreation) + { + read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf)); + iterator_data.buf = read_buffer_iterator.recreateLastReadBuffer(); + } + else + { + peekable_buf->rollbackToCheckpoint(); + } + } + + /// We choose the format with larger number of columns in inferred schema. 
+ size_t max_number_of_columns = 0; + for (const auto & [format_to_detect, schema] : format_to_schema ) + { + if (schema.size() > max_number_of_columns) + { + names_and_types = schema; + format_name = format_to_detect; + max_number_of_columns = schema.size(); + } + } + + if (format_name) + read_buffer_iterator.setFormatName(*format_name); + } + + if (mode == SchemaInferenceMode::UNION) + { + /// For UNION mode we need to know the schema of each file, + /// if we failed to detect the format, we failed to detect the schema of this file + /// in any format. It doesn't make sense to continue. + if (!format_name) + throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files. You can specify the format manually"); + + read_buffer_iterator.setSchemaToLastFile(ColumnsDescription(names_and_types)); + schemas_for_union_mode.emplace_back(names_and_types, read_buffer_iterator.getLastFileName()); + } + + if (format_name && mode == SchemaInferenceMode::DEFAULT) + break; } } + if (!format_name) + throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files. You can specify the format manually"); + /// If we got all schemas from cache, schema_reader can be uninitialized. /// But we still need some stateless methods of ISchemaReader, /// let's initialize it with empty buffer. EmptyReadBuffer empty; if (!schema_reader) - schema_reader = FormatFactory::instance().getSchemaReader(format_name, empty, context, format_settings); + schema_reader = FormatFactory::instance().getSchemaReader(*format_name, empty, context, format_settings); if (mode == SchemaInferenceMode::UNION) { @@ -273,11 +467,23 @@ try } if (names_and_types.empty()) + { + if (iterations <= 1) + { + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file. " + "Error: {}. You can specify the structure manually", + *format_name, + exception_messages); + } + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "All attempts to extract table structure from files failed. " - "Errors:{}\nYou can specify the structure manually", + "Errors:\n{}\nYou can specify the structure manually", exception_messages); + } /// If we have "INSERT SELECT" query then try to order /// columns as they are ordered in table schema for formats @@ -294,22 +500,22 @@ try if (ordered_list) names_and_types = *ordered_list; } + + /// Some formats like CSVWithNames can contain empty column names. We don't support empty column names and further processing can fail with an exception. Let's just remove columns with empty names from the structure. + names_and_types.erase( + std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }), + names_and_types.end()); + + auto columns = ColumnsDescription(names_and_types); + if (mode == SchemaInferenceMode::DEFAULT) + read_buffer_iterator.setResultingSchema(columns); + return {columns, *format_name}; } - else - throw Exception( - ErrorCodes::BAD_ARGUMENTS, - "{} file format doesn't support schema inference. You must specify the structure manually", - format_name); - /// Some formats like CSVWithNames can contain empty column names. We don't support empty column names and further processing can fail with an exception. Let's just remove columns with empty names from the structure. 
- names_and_types.erase( - std::remove_if(names_and_types.begin(), names_and_types.end(), [](const NameAndTypePair & pair) { return pair.name.empty(); }), - names_and_types.end()); - - auto columns = ColumnsDescription(names_and_types); - if (mode == SchemaInferenceMode::DEFAULT) - read_buffer_iterator.setResultingSchema(columns); - return columns; + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "{} file format doesn't support schema inference. You must specify the structure manually", + *format_name); } catch (Exception & e) { @@ -319,16 +525,21 @@ catch (Exception & e) throw; } - ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context) + const ContextPtr & context) { - std::unique_ptr buf_out; - return readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, retry, context, buf_out); + return readSchemaFromFormatImpl(format_name, format_settings, read_buffer_iterator, context).first; +} + +std::pair detectFormatAndReadSchema( + const std::optional & format_settings, + IReadBufferIterator & read_buffer_iterator, + const ContextPtr & context) +{ + return readSchemaFromFormatImpl(std::nullopt, format_settings, read_buffer_iterator, context); } SchemaCache::Key getKeyForSchemaCache( diff --git a/src/Formats/ReadSchemaUtils.h b/src/Formats/ReadSchemaUtils.h index 6aa8f3f9c4c..bb5e068f696 100644 --- a/src/Formats/ReadSchemaUtils.h +++ b/src/Formats/ReadSchemaUtils.h @@ -7,29 +7,68 @@ namespace DB { +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; +} + struct IReadBufferIterator { virtual ~IReadBufferIterator() = default; - virtual void setPreviousReadBuffer(std::unique_ptr /* buffer */) {} - /// Return read buffer of the next file or cached schema. /// In DEFAULT schema inference mode cached schema can be from any file. /// In UNION mode cached schema can be only from current file. /// When there is no files to process, return pair (nullptr, nullopt) - virtual std::pair, std::optional> next() = 0; + struct Data + { + /// Read buffer of the next file. Can be nullptr if there are no more files + /// or when schema was found in cache. + std::unique_ptr buf; + + /// Schema from cache. + /// In DEFAULT schema inference mode cached schema can be from any file. + /// In UNION mode cached schema can be only from current file. + std::optional cached_columns; + + /// Format of the file if known. + std::optional format_name; + }; + + virtual Data next() = 0; + + /// Set read buffer returned in previous iteration. + virtual void setPreviousReadBuffer(std::unique_ptr /* buffer */) {} + + /// Set number of rows to last file extracted during schema inference. + /// Used for caching number of rows from files metadata during schema inference. virtual void setNumRowsToLastFile(size_t /*num_rows*/) {} /// Set schema inferred from last file. Used for UNION mode to cache schema /// per file. virtual void setSchemaToLastFile(const ColumnsDescription & /*columns*/) {} + /// Set resulting inferred schema. Used for DEFAULT mode to cache schema /// for all files. virtual void setResultingSchema(const ColumnsDescription & /*columns*/) {} + /// Set auto detected format name. + virtual void setFormatName(const String & /*format_name*/) {} + /// Get last processed file name for better exception messages. virtual String getLastFileName() const { return ""; } + + /// Return true if method recreateLastReadBuffer is implemented. 
+ virtual bool supportsLastReadBufferRecreation() const { return false; } + + /// Recreate last read buffer to read data from the same file again. + /// Used to detect format from the file content to avoid + /// copying data. + virtual std::unique_ptr recreateLastReadBuffer() + { + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method recreateLastReadBuffer is not implemented"); + } }; struct SingleReadBufferIterator : public IReadBufferIterator @@ -39,12 +78,22 @@ public: { } - std::pair, std::optional> next() override + Data next() override { if (done) - return {nullptr, {}}; + return {nullptr, {}, std::nullopt}; done = true; - return {std::move(buf), {}}; + return Data{std::move(buf), {}, std::nullopt}; + } + + void setPreviousReadBuffer(std::unique_ptr buf_) override + { + buf = std::move(buf_); + } + + std::unique_ptr releaseBuffer() + { + return std::move(buf); } private: @@ -73,17 +122,16 @@ ColumnsDescription readSchemaFromFormat( const String & format_name, const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context); + const ContextPtr & context); -/// If ReadBuffer is created, it will be written to buf_out. -ColumnsDescription readSchemaFromFormat( - const String & format_name, +/// Try to detect the format of the data and it's schema. +/// It runs schema inference for some set of formats on the same file. +/// If schema reader of some format successfully inferred the schema from +/// some file, we consider that the data is in this format. +std::pair detectFormatAndReadSchema( const std::optional & format_settings, IReadBufferIterator & read_buffer_iterator, - bool retry, - ContextPtr & context, - std::unique_ptr & buf_out); + const ContextPtr & context); SchemaCache::Key getKeyForSchemaCache(const String & source, const String & format, const std::optional & format_settings, const ContextPtr & context); SchemaCache::Keys getKeysForSchemaCache(const Strings & sources, const String & format, const std::optional & format_settings, const ContextPtr & context); diff --git a/src/IO/Archives/IArchiveReader.h b/src/IO/Archives/IArchiveReader.h index 84a1dc21f5b..ee516d2655b 100644 --- a/src/IO/Archives/IArchiveReader.h +++ b/src/IO/Archives/IArchiveReader.h @@ -56,6 +56,7 @@ public: /// It's possible to convert a file enumerator to a read buffer and vice versa. 
virtual std::unique_ptr readFile(std::unique_ptr enumerator) = 0; virtual std::unique_ptr nextFile(std::unique_ptr read_buffer) = 0; + virtual std::unique_ptr currentFile(std::unique_ptr read_buffer) = 0; virtual std::vector getAllFiles() = 0; virtual std::vector getAllFiles(NameFilter filter) = 0; diff --git a/src/IO/Archives/LibArchiveReader.cpp b/src/IO/Archives/LibArchiveReader.cpp index 763cd3b171b..eb190f2e0fc 100644 --- a/src/IO/Archives/LibArchiveReader.cpp +++ b/src/IO/Archives/LibArchiveReader.cpp @@ -340,6 +340,15 @@ std::unique_ptr LibArchiveReader::nextFile(std return std::make_unique(std::move(handle)); } +std::unique_ptr LibArchiveReader::currentFile(std::unique_ptr read_buffer) +{ + if (!dynamic_cast(read_buffer.get())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong ReadBuffer passed to nextFile()"); + auto read_buffer_from_libarchive = std::unique_ptr(static_cast(read_buffer.release())); + auto handle = std::move(*read_buffer_from_libarchive).releaseHandle(); + return std::make_unique(std::move(handle)); +} + std::vector LibArchiveReader::getAllFiles() { return getAllFiles({}); diff --git a/src/IO/Archives/LibArchiveReader.h b/src/IO/Archives/LibArchiveReader.h index 3dadd710089..c4b08d8ddf7 100644 --- a/src/IO/Archives/LibArchiveReader.h +++ b/src/IO/Archives/LibArchiveReader.h @@ -40,6 +40,7 @@ public: /// It's possible to convert a file enumerator to a read buffer and vice versa. std::unique_ptr readFile(std::unique_ptr enumerator) override; std::unique_ptr nextFile(std::unique_ptr read_buffer) override; + std::unique_ptr currentFile(std::unique_ptr read_buffer) override; std::vector getAllFiles() override; std::vector getAllFiles(NameFilter filter) override; diff --git a/src/IO/Archives/ZipArchiveReader.cpp b/src/IO/Archives/ZipArchiveReader.cpp index 636042ec586..63fdf5fe190 100644 --- a/src/IO/Archives/ZipArchiveReader.cpp +++ b/src/IO/Archives/ZipArchiveReader.cpp @@ -589,6 +589,15 @@ std::unique_ptr ZipArchiveReader::nextFile(std return std::make_unique(std::move(handle)); } +std::unique_ptr ZipArchiveReader::currentFile(std::unique_ptr read_buffer) +{ + if (!dynamic_cast(read_buffer.get())) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Wrong ReadBuffer passed to nextFile()"); + auto read_buffer_from_zip = std::unique_ptr(static_cast(read_buffer.release())); + auto handle = std::move(*read_buffer_from_zip).releaseHandle(); + return std::make_unique(std::move(handle)); +} + std::vector ZipArchiveReader::getAllFiles() { return getAllFiles({}); diff --git a/src/IO/Archives/ZipArchiveReader.h b/src/IO/Archives/ZipArchiveReader.h index a8788064fec..4b1910839eb 100644 --- a/src/IO/Archives/ZipArchiveReader.h +++ b/src/IO/Archives/ZipArchiveReader.h @@ -47,6 +47,7 @@ public: /// It's possible to convert a file enumerator to a read buffer and vice versa. 
std::unique_ptr readFile(std::unique_ptr enumerator) override; std::unique_ptr nextFile(std::unique_ptr read_buffer) override; + std::unique_ptr currentFile(std::unique_ptr read_buffer) override; std::vector getAllFiles() override; std::vector getAllFiles(NameFilter filter) override; diff --git a/src/Processors/Formats/ISchemaReader.cpp b/src/Processors/Formats/ISchemaReader.cpp index 26c632b83dc..c5c6ba84d9a 100644 --- a/src/Processors/Formats/ISchemaReader.cpp +++ b/src/Processors/Formats/ISchemaReader.cpp @@ -81,7 +81,7 @@ IIRowSchemaReader::IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & fo { } -void IIRowSchemaReader::setContext(ContextPtr & context) +void IIRowSchemaReader::setContext(const ContextPtr & context) { ColumnsDescription columns; if (tryParseColumnsListFromString(hints_str, columns, context, hints_parsing_error)) diff --git a/src/Processors/Formats/ISchemaReader.h b/src/Processors/Formats/ISchemaReader.h index 94df71a88b4..23c6606a6bd 100644 --- a/src/Processors/Formats/ISchemaReader.h +++ b/src/Processors/Formats/ISchemaReader.h @@ -34,7 +34,7 @@ public: virtual bool hasStrictOrderOfColumns() const { return true; } virtual bool needContext() const { return false; } - virtual void setContext(ContextPtr &) {} + virtual void setContext(const ContextPtr &) {} virtual void setMaxRowsAndBytesToRead(size_t, size_t) {} virtual size_t getNumRowsRead() const { return 0; } @@ -56,7 +56,7 @@ public: IIRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, DataTypePtr default_type_ = nullptr); bool needContext() const override { return !hints_str.empty(); } - void setContext(ContextPtr & context) override; + void setContext(const ContextPtr & context) override; protected: void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp index 53cb5a77898..62d33d36206 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.cpp @@ -215,7 +215,7 @@ JSONColumnsSchemaReaderBase::JSONColumnsSchemaReaderBase( { } -void JSONColumnsSchemaReaderBase::setContext(ContextPtr & ctx) +void JSONColumnsSchemaReaderBase::setContext(const ContextPtr & ctx) { ColumnsDescription columns; if (tryParseColumnsListFromString(hints_str, columns, ctx, hints_parsing_error)) diff --git a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h index fe80d77cd87..ee7e79afc54 100644 --- a/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h +++ b/src/Processors/Formats/Impl/JSONColumnsBlockInputFormatBase.h @@ -84,7 +84,7 @@ public: void transformTypesFromDifferentFilesIfNeeded(DataTypePtr & type, DataTypePtr & new_type) override; bool needContext() const override { return !hints_str.empty(); } - void setContext(ContextPtr & ctx) override; + void setContext(const ContextPtr & ctx) override; void setMaxRowsAndBytesToRead(size_t max_rows, size_t max_bytes) override { diff --git a/src/Processors/Formats/Impl/JSONRowInputFormat.cpp b/src/Processors/Formats/Impl/JSONRowInputFormat.cpp index f78ce530ecb..7283eb1330f 100644 --- a/src/Processors/Formats/Impl/JSONRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/JSONRowInputFormat.cpp @@ -70,27 +70,36 @@ void JSONRowInputFormat::resetReadBuffer() JSONEachRowRowInputFormat::resetReadBuffer(); } 
-JSONRowSchemaReader::JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_) - : JSONRowSchemaReader(std::make_unique(in_), format_settings_) +JSONRowSchemaReader::JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool fallback_to_json_each_row_) + : JSONRowSchemaReader(std::make_unique(in_), format_settings_, fallback_to_json_each_row_) { } -JSONRowSchemaReader::JSONRowSchemaReader(std::unique_ptr buf, const DB::FormatSettings & format_settings_) - : JSONEachRowSchemaReader(*buf, format_settings_), peekable_buf(std::move(buf)) +JSONRowSchemaReader::JSONRowSchemaReader(std::unique_ptr buf, const DB::FormatSettings & format_settings_, bool fallback_to_json_each_row_) + : JSONEachRowSchemaReader(*buf, format_settings_), peekable_buf(std::move(buf)), fallback_to_json_each_row(fallback_to_json_each_row_) { } NamesAndTypesList JSONRowSchemaReader::readSchema() { skipBOMIfExists(*peekable_buf); - PeekableReadBufferCheckpoint checkpoint(*peekable_buf); - /// Try to parse metadata, if failed, try to parse data as JSONEachRow format - NamesAndTypesList names_and_types; - if (JSONUtils::checkAndSkipObjectStart(*peekable_buf) && JSONUtils::tryReadMetadata(*peekable_buf, names_and_types)) - return names_and_types; - peekable_buf->rollbackToCheckpoint(true); - return JSONEachRowSchemaReader::readSchema(); + if (fallback_to_json_each_row) + { + PeekableReadBufferCheckpoint checkpoint(*peekable_buf); + /// Try to parse metadata, if failed, try to parse data as JSONEachRow format + NamesAndTypesList names_and_types; + if (JSONUtils::checkAndSkipObjectStart(*peekable_buf) && JSONUtils::tryReadMetadata(*peekable_buf, names_and_types)) + return names_and_types; + + peekable_buf->rollbackToCheckpoint(true); + return JSONEachRowSchemaReader::readSchema(); + } + else + { + JSONUtils::skipObjectStart(*peekable_buf); + return JSONUtils::readMetadata(*peekable_buf); + } } void registerInputFormatJSON(FormatFactory & factory) @@ -109,19 +118,19 @@ void registerInputFormatJSON(FormatFactory & factory) void registerJSONSchemaReader(FormatFactory & factory) { - auto register_schema_reader = [&](const String & format) + auto register_schema_reader = [&](const String & format, bool fallback_to_json_each_row) { factory.registerSchemaReader( - format, [](ReadBuffer & buf, const FormatSettings & format_settings) { return std::make_unique(buf, format_settings); }); + format, [fallback_to_json_each_row](ReadBuffer & buf, const FormatSettings & format_settings) { return std::make_unique(buf, format_settings, fallback_to_json_each_row); }); factory.registerAdditionalInfoForSchemaCacheGetter(format, [](const FormatSettings & settings) { return getAdditionalFormatInfoByEscapingRule(settings, FormatSettings::EscapingRule::JSON); }); }; - register_schema_reader("JSON"); + register_schema_reader("JSON", true); /// JSONCompact has the same suffix with metadata. 
- register_schema_reader("JSONCompact"); + register_schema_reader("JSONCompact", false); } } diff --git a/src/Processors/Formats/Impl/JSONRowInputFormat.h b/src/Processors/Formats/Impl/JSONRowInputFormat.h index b2e1d8a3d6d..6db5cee380a 100644 --- a/src/Processors/Formats/Impl/JSONRowInputFormat.h +++ b/src/Processors/Formats/Impl/JSONRowInputFormat.h @@ -45,16 +45,17 @@ private: class JSONRowSchemaReader : public JSONEachRowSchemaReader { public: - JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_); + JSONRowSchemaReader(ReadBuffer & in_, const FormatSettings & format_settings_, bool fallback_to_json_each_row_); NamesAndTypesList readSchema() override; bool hasStrictOrderOfColumns() const override { return false; } private: - JSONRowSchemaReader(std::unique_ptr buf, const FormatSettings & format_settings_); + JSONRowSchemaReader(std::unique_ptr buf, const FormatSettings & format_settings_, bool fallback_to_json_each_row_); std::unique_ptr peekable_buf; + bool fallback_to_json_each_row; }; } diff --git a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp index a6e4600d83b..f5edfb7c9d4 100644 --- a/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateRowInputFormat.cpp @@ -609,7 +609,9 @@ void registerTemplateSchemaReader(FormatFactory & factory) { size_t index = 0; auto idx_getter = [&](const String &) -> std::optional { return index++; }; - auto row_format = fillRowFormat(settings, idx_getter, false); + ParsedTemplateFormatString row_format; + if (!settings.template_settings.row_format.empty()) + row_format = fillRowFormat(settings, idx_getter, false); std::unordered_set visited_escaping_rules; String result = fmt::format("row_format={}, resultset_format={}, row_between_delimiter={}", settings.template_settings.row_format, diff --git a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h index bf2765bfd1e..f82a8c8ab64 100644 --- a/src/Processors/Formats/Impl/ValuesBlockInputFormat.h +++ b/src/Processors/Formats/Impl/ValuesBlockInputFormat.h @@ -37,7 +37,7 @@ public: void resetReadBuffer() override; /// TODO: remove context somehow. 
- void setContext(ContextPtr & context_) { context = Context::createCopy(context_); } + void setContext(const ContextPtr & context_) { context = Context::createCopy(context_); } const BlockMissingValues & getMissingValues() const override { return block_missing_values; } diff --git a/src/Server/TCPHandler.cpp b/src/Server/TCPHandler.cpp index fa7206eeaac..8120667916e 100644 --- a/src/Server/TCPHandler.cpp +++ b/src/Server/TCPHandler.cpp @@ -35,7 +35,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Storages/DataLakes/IStorageDataLake.h b/src/Storages/DataLakes/IStorageDataLake.h index 77a22cd00fc..72b182ad1f4 100644 --- a/src/Storages/DataLakes/IStorageDataLake.h +++ b/src/Storages/DataLakes/IStorageDataLake.h @@ -38,25 +38,25 @@ public: static ColumnsDescription getTableStructureFromData( Configuration & base_configuration, const std::optional & format_settings, - ContextPtr local_context) + const ContextPtr & local_context) { auto configuration = getConfigurationForDataRead(base_configuration, local_context); return Storage::getTableStructureFromData(configuration, format_settings, local_context); } - static Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context) + static Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context) { return Storage::getConfiguration(engine_args, local_context, /* get_format_from_file */false); } - Configuration updateConfigurationAndGetCopy(ContextPtr local_context) override + Configuration updateConfigurationAndGetCopy(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); return Storage::getConfiguration(); } - void updateConfiguration(ContextPtr local_context) override + void updateConfiguration(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); @@ -64,7 +64,7 @@ public: private: static Configuration getConfigurationForDataRead( - const Configuration & base_configuration, ContextPtr local_context, const Strings & keys = {}) + const Configuration & base_configuration, const ContextPtr & local_context, const Strings & keys = {}) { auto configuration{base_configuration}; configuration.update(local_context); @@ -84,12 +84,12 @@ private: return configuration; } - static Strings getDataFiles(const Configuration & configuration, ContextPtr local_context) + static Strings getDataFiles(const Configuration & configuration, const ContextPtr & local_context) { return MetadataParser().getFiles(configuration, local_context); } - void updateConfigurationImpl(ContextPtr local_context) + void updateConfigurationImpl(const ContextPtr & local_context) { const bool updated = base_configuration.update(local_context); auto new_keys = getDataFiles(base_configuration, local_context); diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp b/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp index 20ac77976cb..faef21d6c72 100644 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp +++ b/src/Storages/DataLakes/Iceberg/StorageIceberg.cpp @@ -40,7 +40,7 @@ StorageIceberg::StorageIceberg( ColumnsDescription StorageIceberg::getTableStructureFromData( Configuration & base_configuration, const std::optional &, - ContextPtr local_context) + const ContextPtr & local_context) { auto configuration{base_configuration}; configuration.update(local_context); @@ -48,7 +48,7 @@ ColumnsDescription StorageIceberg::getTableStructureFromData( 
return ColumnsDescription(metadata->getTableSchema()); } -void StorageIceberg::updateConfigurationImpl(ContextPtr local_context) +void StorageIceberg::updateConfigurationImpl(const ContextPtr & local_context) { const bool updated = base_configuration.update(local_context); auto new_metadata = parseIcebergMetadata(base_configuration, local_context); diff --git a/src/Storages/DataLakes/Iceberg/StorageIceberg.h b/src/Storages/DataLakes/Iceberg/StorageIceberg.h index a18865b5a54..0b346ef0175 100644 --- a/src/Storages/DataLakes/Iceberg/StorageIceberg.h +++ b/src/Storages/DataLakes/Iceberg/StorageIceberg.h @@ -51,28 +51,28 @@ public: static ColumnsDescription getTableStructureFromData( Configuration & base_configuration, const std::optional &, - ContextPtr local_context); + const ContextPtr & local_context); static Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context) { return StorageS3::getConfiguration(engine_args, local_context, /* get_format_from_file */false); } - Configuration updateConfigurationAndGetCopy(ContextPtr local_context) override + Configuration updateConfigurationAndGetCopy(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); return StorageS3::getConfiguration(); } - void updateConfiguration(ContextPtr local_context) override + void updateConfiguration(const ContextPtr & local_context) override { std::lock_guard lock(configuration_update_mutex); updateConfigurationImpl(local_context); } private: - void updateConfigurationImpl(ContextPtr local_context); + void updateConfigurationImpl(const ContextPtr & local_context); std::unique_ptr current_metadata; Configuration base_configuration; diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index 1e26f1be72c..a846e9fd9ef 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -67,6 +67,7 @@ namespace ErrorCodes extern const int BAD_ARGUMENTS; extern const int LOGICAL_ERROR; extern const int CANNOT_COMPILE_REGEXP; + extern const int CANNOT_DETECT_FORMAT; } namespace { @@ -194,7 +195,7 @@ StorageHDFS::StorageHDFS( const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_, const bool distributed_processing_, ASTPtr partition_by_) @@ -206,7 +207,8 @@ StorageHDFS::StorageHDFS( , distributed_processing(distributed_processing_) , partition_by(partition_by_) { - FormatFactory::instance().checkFormatName(format_name); + if (format_name != "auto") + FormatFactory::instance().checkFormatName(format_name); context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); checkHDFSURL(uri_); @@ -217,11 +219,19 @@ StorageHDFS::StorageHDFS( if (columns_.empty()) { - auto columns = getTableStructureFromData(format_name, uri_, compression_method, context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromData(uri_, compression_method_, context_); + else + columns = getTableStructureFromData(format_name, uri_, compression_method, context_); + storage_metadata.setColumns(columns); } else { + if (format_name == "auto") + format_name = getTableStructureAndFormatFromData(uri_, compression_method_, context_).second; + /// We don't allow special columns in HDFS storage. 
if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine HDFS doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -243,25 +253,25 @@ namespace ReadBufferIterator( const std::vector & paths_with_info_, const String & uri_without_path_, - const String & format_, + std::optional format_, const String & compression_method_, const ContextPtr & context_) : WithContext(context_) , paths_with_info(paths_with_info_) , uri_without_path(uri_without_path_) - , format(format_) + , format(std::move(format_)) , compression_method(compression_method_) { } - std::pair, std::optional> next() override + Data next() override { bool is_first = current_index == 0; /// For default mode check cached columns for all paths on first iteration. if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) { if (auto cached_columns = tryGetColumnsFromCache(paths_with_info)) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } StorageHDFS::PathWithInfo path_with_info; @@ -271,10 +281,17 @@ namespace if (current_index == paths_with_info.size()) { if (is_first) - throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. " - "You must specify table structure manually", format); - return {nullptr, std::nullopt}; + { + if (format) + throw Exception(ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. " + "You can specify table structure manually", *format); + + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because all files are empty. 
You can specify table structure manually"); + } + return {nullptr, std::nullopt, format}; } path_with_info = paths_with_info[current_index++]; @@ -285,7 +302,7 @@ namespace { std::vector paths = {path_with_info}; if (auto cached_columns = tryGetColumnsFromCache(paths)) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } auto compression = chooseCompressionMethod(path_with_info.path, compression_method); @@ -293,7 +310,7 @@ namespace if (!getContext()->getSettingsRef().hdfs_skip_empty_files || !impl->eof()) { const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; - return {wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)), std::nullopt}; + return {wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)), std::nullopt, format}; } } } @@ -304,7 +321,7 @@ namespace return; String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, format, std::nullopt, getContext()); + auto key = getKeyForSchemaCache(source, *format, std::nullopt, getContext()); StorageHDFS::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -315,7 +332,7 @@ namespace return; String source = uri_without_path + paths_with_info[current_index - 1].path; - auto key = getKeyForSchemaCache(source, format, std::nullopt, getContext()); + auto key = getKeyForSchemaCache(source, *format, std::nullopt, getContext()); StorageHDFS::getSchemaCache(getContext()).addColumns(key, columns); } @@ -328,10 +345,15 @@ namespace Strings sources; sources.reserve(paths_with_info.size()); std::transform(paths_with_info.begin(), paths_with_info.end(), std::back_inserter(sources), [&](const StorageHDFS::PathWithInfo & path_with_info){ return uri_without_path + path_with_info.path; }); - auto cache_keys = getKeysForSchemaCache(sources, format, {}, getContext()); + auto cache_keys = getKeysForSchemaCache(sources, *format, {}, getContext()); StorageHDFS::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { if (current_index != 0) @@ -340,13 +362,27 @@ namespace return ""; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_index > 0 && current_index <= paths_with_info.size()); + auto path_with_info = paths_with_info[current_index - 1]; + auto compression = chooseCompressionMethod(path_with_info.path, compression_method); + auto impl = std::make_unique(uri_without_path, path_with_info.path, getContext()->getGlobalContext()->getConfigRef(), getContext()->getReadSettings()); + const Int64 zstd_window_log_max = getContext()->getSettingsRef().zstd_window_log_max; + return wrapReadBufferWithCompressionMethod(std::move(impl), compression, static_cast(zstd_window_log_max)); + } + private: std::optional tryGetColumnsFromCache(const std::vector & paths_with_info_) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_hdfs) + auto context = getContext(); + + if (!context->getSettingsRef().schema_inference_use_cache_for_hdfs) return std::nullopt; - auto & schema_cache = StorageHDFS::getSchemaCache(getContext()); + auto & schema_cache = StorageHDFS::getSchemaCache(context); for (const auto & path_with_info : paths_with_info_) { auto get_last_mod_time = [&]() -> std::optional @@ -354,7 
+390,7 @@ namespace if (path_with_info.info) return path_with_info.info->last_mod_time; - auto builder = createHDFSBuilder(uri_without_path + "/", getContext()->getGlobalContext()->getConfigRef()); + auto builder = createHDFSBuilder(uri_without_path + "/", context->getGlobalContext()->getConfigRef()); auto fs = createHDFSFS(builder.get()); HDFSFileInfoPtr hdfs_info(hdfsGetPathInfo(fs.get(), path_with_info.path.c_str())); if (hdfs_info) @@ -364,10 +400,28 @@ namespace }; String url = uri_without_path + path_with_info.path; - auto cache_key = getKeyForSchemaCache(url, format, {}, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(url, *format, {}, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(url, format_name, {}, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -375,29 +429,49 @@ namespace const std::vector & paths_with_info; const String & uri_without_path; - const String & format; + std::optional format; const String & compression_method; size_t current_index = 0; }; } -ColumnsDescription StorageHDFS::getTableStructureFromData( - const String & format, +std::pair StorageHDFS::getTableStructureAndFormatFromDataImpl( + std::optional format, const String & uri, const String & compression_method, - ContextPtr ctx) + const ContextPtr & ctx) { const auto [path_from_uri, uri_without_path] = getPathFromUriAndUriWithoutPath(uri); auto paths_with_info = getPathsList(path_from_uri, uri, ctx); - if (paths_with_info.empty() && !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format)) + if (paths_with_info.empty() && (!format || !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format))) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files in HDFS with provided path." + " You can specify table structure manually", *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files in HDFS with provided path." - " You must specify table structure manually", format); + "The data format cannot be detected by the contents of the files, because there are no files in HDFS with provided path." 
+ " You can specify the format manually"); + } ReadBufferIterator read_buffer_iterator(paths_with_info, uri_without_path, format, compression_method, ctx); - return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, paths_with_info.size() > 1, ctx); + if (format) + return {readSchemaFromFormat(*format, std::nullopt, read_buffer_iterator, ctx), *format}; + return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, ctx); +} + +std::pair StorageHDFS::getTableStructureAndFormatFromData(const String & uri, const String & compression_method, const ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, uri, compression_method, ctx); +} + +ColumnsDescription StorageHDFS::getTableStructureFromData(const String & format, const String & uri, const String & compression_method, const DB::ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(format, uri, compression_method, ctx).first; } class HDFSSource::DisclosedGlobIterator::Impl @@ -533,7 +607,7 @@ StorageHDFS::PathWithInfo HDFSSource::URISIterator::next() HDFSSource::HDFSSource( const ReadFromFormatInfo & info, StorageHDFSPtr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, bool need_only_count_) @@ -712,7 +786,7 @@ public: HDFSSink(const String & uri, const String & format, const Block & sample_block, - ContextPtr context, + const ContextPtr & context, const CompressionMethod compression_method) : SinkToStorage(sample_block) { @@ -1073,7 +1147,7 @@ void registerStorageHDFS(StorageFactory & factory) } if (format_name == "auto") - format_name = FormatFactory::instance().getFormatFromFileName(url, true); + format_name = FormatFactory::instance().getFormatFromFileName(url); String compression_method; if (engine_args.size() == 3) diff --git a/src/Storages/HDFS/StorageHDFS.h b/src/Storages/HDFS/StorageHDFS.h index f1f0019d3e0..1edbf2b77ce 100644 --- a/src/Storages/HDFS/StorageHDFS.h +++ b/src/Storages/HDFS/StorageHDFS.h @@ -44,7 +44,7 @@ public: const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_ = "", bool distributed_processing_ = false, ASTPtr partition_by = nullptr); @@ -86,7 +86,12 @@ public: const String & format, const String & uri, const String & compression_method, - ContextPtr ctx); + const ContextPtr & ctx); + + static std::pair getTableStructureAndFormatFromData( + const String & uri, + const String & compression_method, + const ContextPtr & ctx); static SchemaCache & getSchemaCache(const ContextPtr & ctx); @@ -97,6 +102,12 @@ protected: friend class ReadFromHDFS; private: + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, + const String & uri, + const String & compression_method, + const ContextPtr & ctx); + std::vector uris; String format_name; String compression_method; @@ -141,7 +152,7 @@ public: HDFSSource( const ReadFromFormatInfo & info, StorageHDFSPtr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, std::shared_ptr file_iterator_, bool need_only_count_); diff --git a/src/Storages/HDFS/StorageHDFSCluster.cpp b/src/Storages/HDFS/StorageHDFSCluster.cpp index 2e8129b9845..a1e03926520 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.cpp +++ b/src/Storages/HDFS/StorageHDFSCluster.cpp @@ -43,12 +43,10 @@ StorageHDFSCluster::StorageHDFSCluster( const String & format_name_, const 
ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const String & compression_method_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageHDFSCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const String & compression_method) + : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageHDFSCluster (" + table_id_.table_name + ")")) , uri(uri_) , format_name(format_name_) - , compression_method(compression_method_) { checkHDFSURL(uri_); context_->getRemoteHostFilter().checkURL(Poco::URI(uri_)); @@ -57,11 +55,20 @@ StorageHDFSCluster::StorageHDFSCluster( if (columns_.empty()) { - auto columns = StorageHDFS::getTableStructureFromData(format_name, uri_, compression_method, context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = StorageHDFS::getTableStructureAndFormatFromData(uri_, compression_method, context_); + else + columns = StorageHDFS::getTableStructureFromData(format_name, uri_, compression_method, context_); storage_metadata.setColumns(columns); } else + { + if (format_name == "auto") + format_name = StorageHDFS::getTableStructureAndFormatFromData(uri_, compression_method, context_).second; + storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -69,13 +76,14 @@ StorageHDFSCluster::StorageHDFSCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageHDFSCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageHDFSCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function hdfsCluster, got '{}'", queryToString(query)); - TableFunctionHDFSCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionHDFSCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), format_name, context); } diff --git a/src/Storages/HDFS/StorageHDFSCluster.h b/src/Storages/HDFS/StorageHDFSCluster.h index 7c4c41a573a..40884f98984 100644 --- a/src/Storages/HDFS/StorageHDFSCluster.h +++ b/src/Storages/HDFS/StorageHDFSCluster.h @@ -28,8 +28,7 @@ public: const String & format_name_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const String & compression_method_, - bool structure_argument_was_provided_); + const String & compression_method); std::string getName() const override { return "HDFSCluster"; } @@ -42,11 +41,10 @@ public: bool supportsTrivialCountOptimization() const override { return true; } private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; String uri; String format_name; - String compression_method; NamesAndTypesList virtual_columns; }; diff --git a/src/Storages/IStorageCluster.cpp b/src/Storages/IStorageCluster.cpp index 
6f42d8f855c..348e37fc72c 100644 --- a/src/Storages/IStorageCluster.cpp +++ b/src/Storages/IStorageCluster.cpp @@ -32,12 +32,10 @@ namespace DB IStorageCluster::IStorageCluster( const String & cluster_name_, const StorageID & table_id_, - Poco::Logger * log_, - bool structure_argument_was_provided_) + Poco::Logger * log_) : IStorage(table_id_) , log(log_) , cluster_name(cluster_name_) - , structure_argument_was_provided(structure_argument_was_provided_) { } @@ -130,8 +128,7 @@ void IStorageCluster::read( query_to_send = interpreter.getQueryInfo().query->clone(); } - if (!structure_argument_was_provided) - addColumnsStructureToQuery(query_to_send, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), context); + updateQueryToSendIfNeeded(query_to_send, storage_snapshot, context); RestoreQualifiedNamesVisitor::Data data; data.distributed_table = DatabaseAndTableWithAlias(*getTableExpression(query_info.query->as(), 0)); diff --git a/src/Storages/IStorageCluster.h b/src/Storages/IStorageCluster.h index b233f20103d..28ebda5125e 100644 --- a/src/Storages/IStorageCluster.h +++ b/src/Storages/IStorageCluster.h @@ -19,8 +19,7 @@ public: IStorageCluster( const String & cluster_name_, const StorageID & table_id_, - Poco::Logger * log_, - bool structure_argument_was_provided_); + Poco::Logger * log_); void read( QueryPlan & query_plan, @@ -42,13 +41,11 @@ public: protected: virtual void updateBeforeRead(const ContextPtr &) {} - - virtual void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) = 0; + virtual void updateQueryToSendIfNeeded(ASTPtr & /*query*/, const StorageSnapshotPtr & /*storage_snapshot*/, const ContextPtr & /*context*/) {} private: Poco::Logger * log; String cluster_name; - bool structure_argument_was_provided; }; diff --git a/src/Storages/S3Queue/StorageS3Queue.cpp b/src/Storages/S3Queue/StorageS3Queue.cpp index bc33e8cf2a9..098d279e482 100644 --- a/src/Storages/S3Queue/StorageS3Queue.cpp +++ b/src/Storages/S3Queue/StorageS3Queue.cpp @@ -143,11 +143,17 @@ StorageS3Queue::StorageS3Queue( StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = StorageS3::getTableStructureFromDataImpl(configuration, format_settings, context_); + ColumnsDescription columns; + if (configuration.format == "auto") + std::tie(columns, configuration.format) = StorageS3::getTableStructureAndFormatFromData(configuration, format_settings, context_); + else + columns = StorageS3::getTableStructureFromData(configuration, format_settings, context_); storage_metadata.setColumns(columns); } else { + if (configuration.format == "auto") + configuration.format = StorageS3::getTableStructureAndFormatFromData(configuration, format_settings, context_).second; storage_metadata.setColumns(columns_); } diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index cd841a1a673..888d360aff1 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -65,6 +65,7 @@ namespace ErrorCodes extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_COMPILE_REGEXP; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int LOGICAL_ERROR; extern const int NOT_IMPLEMENTED; @@ -127,7 +128,7 @@ void StorageAzureBlob::processNamedCollectionResult(StorageAzureBlob::Configurat } -StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine_args, ContextPtr local_context) +StorageAzureBlob::Configuration 
StorageAzureBlob::getConfiguration(ASTs & engine_args, const ContextPtr & local_context) { StorageAzureBlob::Configuration configuration; @@ -143,7 +144,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path); return configuration; } @@ -236,13 +237,13 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path); return configuration; } -AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(ContextPtr local_context) +AzureObjectStorage::SettingsPtr StorageAzureBlob::createSettings(const ContextPtr & local_context) { const auto & context_settings = local_context->getSettingsRef(); auto settings_ptr = std::make_unique(); @@ -447,7 +448,7 @@ Poco::URI StorageAzureBlob::Configuration::getConnectionURL() const StorageAzureBlob::StorageAzureBlob( const Configuration & configuration_, std::unique_ptr && object_storage_, - ContextPtr context, + const ContextPtr & context, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -463,17 +464,25 @@ StorageAzureBlob::StorageAzureBlob( , format_settings(format_settings_) , partition_by(partition_by_) { - FormatFactory::instance().checkFormatName(configuration.format); + if (configuration.format != "auto") + FormatFactory::instance().checkFormatName(configuration.format); context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.getConnectionURL()); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromData(object_storage.get(), configuration, format_settings, context, distributed_processing); + ColumnsDescription columns; + if (configuration.format == "auto") + std::tie(columns, configuration.format) = getTableStructureAndFormatFromData(object_storage.get(), configuration, format_settings, context); + else + columns = getTableStructureFromData(object_storage.get(), configuration, format_settings, context); storage_metadata.setColumns(columns); } else { + if (configuration.format == "auto") + configuration.format = getTableStructureAndFormatFromData(object_storage.get(), configuration, format_settings, context).second; + /// We don't allow special columns in File storage. 
if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine AzureBlobStorage doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -517,7 +526,7 @@ public: StorageAzureBlobSink( const String & format, const Block & sample_block_, - ContextPtr context, + const ContextPtr & context, std::optional format_settings_, const CompressionMethod compression_method, AzureObjectStorage * object_storage, @@ -607,22 +616,21 @@ private: std::mutex cancel_mutex; }; -class PartitionedStorageAzureBlobSink : public PartitionedSink +class PartitionedStorageAzureBlobSink : public PartitionedSink, WithContext { public: PartitionedStorageAzureBlobSink( const ASTPtr & partition_by, const String & format_, const Block & sample_block_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, const CompressionMethod compression_method_, AzureObjectStorage * object_storage_, const String & blob_) - : PartitionedSink(partition_by, context_, sample_block_) + : PartitionedSink(partition_by, context_, sample_block_), WithContext(context_) , format(format_) , sample_block(sample_block_) - , context(context_) , compression_method(compression_method_) , object_storage(object_storage_) , blob(blob_) @@ -638,7 +646,7 @@ public: return std::make_shared( format, sample_block, - context, + getContext(), format_settings, compression_method, object_storage, @@ -649,7 +657,6 @@ public: private: const String format; const Block sample_block; - const ContextPtr context; const CompressionMethod compression_method; AzureObjectStorage * object_storage; const String blob; @@ -913,7 +920,7 @@ StorageAzureBlobSource::GlobIterator::GlobIterator( String blob_path_with_globs_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs_, std::function file_progress_callback_) : IIterator(context_) @@ -1028,7 +1035,7 @@ StorageAzureBlobSource::KeysIterator::KeysIterator( const Strings & keys_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs, std::function file_progress_callback) : IIterator(context_) @@ -1147,7 +1154,7 @@ StorageAzureBlobSource::StorageAzureBlobSource( const ReadFromFormatInfo & info, const String & format_, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, String compression_hint_, @@ -1290,6 +1297,7 @@ namespace ReadBufferIterator( const std::shared_ptr & file_iterator_, AzureObjectStorage * object_storage_, + std::optional format_, const StorageAzureBlob::Configuration & configuration_, const std::optional & format_settings_, const RelativePathsWithMetadata & read_keys_, @@ -1298,19 +1306,20 @@ namespace , file_iterator(file_iterator_) , object_storage(object_storage_) , configuration(configuration_) + , format(std::move(format_)) , format_settings(format_settings_) , read_keys(read_keys_) , prev_read_keys_size(read_keys_.size()) { } - std::pair, std::optional> next() override + Data next() override { /// For default mode check cached columns for currently read keys on first iteration. 
if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) { if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } current_path_with_metadata = file_iterator->next(); @@ -1318,12 +1327,20 @@ namespace if (current_path_with_metadata.relative_path.empty()) { if (first) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path " + "in AzureBlobStorage. You can specify table structure manually", *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path " - "in AzureBlobStorage. You must specify table structure manually", configuration.format); + "The data format cannot be detected by the contents of the files, because there are no files with provided path " + "in AzureBlobStorage. You can specify table structure manually"); + } - return {nullptr, std::nullopt}; + return {nullptr, std::nullopt, format}; } first = false; @@ -1334,13 +1351,13 @@ namespace auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); prev_read_keys_size = read_keys.size(); if (columns_from_cache) - return {nullptr, columns_from_cache}; + return {nullptr, columns_from_cache, format}; } else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { RelativePathsWithMetadata paths = {current_path_with_metadata}; if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end())) - return {nullptr, columns_from_cache}; + return {nullptr, columns_from_cache, format}; } first = false; @@ -1348,7 +1365,7 @@ namespace return {wrapReadBufferWithCompressionMethod( object_storage->readObject(StoredObject(current_path_with_metadata.relative_path), getContext()->getReadSettings(), {}, current_path_with_metadata.metadata.size_bytes), chooseCompressionMethod(current_path_with_metadata.relative_path, configuration.compression_method), - zstd_window_log_max), std::nullopt}; + zstd_window_log_max), std::nullopt, format}; } void setNumRowsToLastFile(size_t num_rows) override @@ -1357,7 +1374,7 @@ namespace return; String source = fs::path(configuration.connection_url) / configuration.container / current_path_with_metadata.relative_path; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageAzureBlob::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -1368,7 +1385,7 @@ namespace return; String source = fs::path(configuration.connection_url) / configuration.container / current_path_with_metadata.relative_path; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageAzureBlob::getSchemaCache(getContext()).addColumns(key, columns); } @@ -1382,16 +1399,36 @@ namespace Strings sources; sources.reserve(read_keys.size()); std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket + '/' + elem.relative_path; }); - auto cache_keys = getKeysForSchemaCache(sources, configuration.format, format_settings, getContext()); + 
auto cache_keys = getKeysForSchemaCache(sources, *format, format_settings, getContext()); StorageAzureBlob::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { return current_path_with_metadata.relative_path; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); + return wrapReadBufferWithCompressionMethod( + object_storage->readObject(StoredObject(current_path_with_metadata.relative_path), getContext()->getReadSettings(), {}, current_path_with_metadata.metadata.size_bytes), + chooseCompressionMethod(current_path_with_metadata.relative_path, configuration.compression_method), + zstd_window_log_max); + } + private: std::optional tryGetColumnsFromCache(const RelativePathsWithMetadata::const_iterator & begin, const RelativePathsWithMetadata::const_iterator & end) { - auto & schema_cache = StorageAzureBlob::getSchemaCache(getContext()); + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_azure) + return std::nullopt; + + auto & schema_cache = StorageAzureBlob::getSchemaCache(context); for (auto it = begin; it < end; ++it) { auto get_last_mod_time = [&] -> std::optional @@ -1403,10 +1440,28 @@ namespace auto host_and_bucket = configuration.connection_url + '/' + configuration.container; String source = host_and_bucket + '/' + it->relative_path; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(source, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(source, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. 
+ format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -1415,6 +1470,7 @@ namespace std::shared_ptr file_iterator; AzureObjectStorage * object_storage; const StorageAzureBlob::Configuration & configuration; + std::optional format; const std::optional & format_settings; const RelativePathsWithMetadata & read_keys; size_t prev_read_keys_size; @@ -1423,21 +1479,16 @@ namespace }; } -ColumnsDescription StorageAzureBlob::getTableStructureFromData( +std::pair StorageAzureBlob::getTableStructureAndFormatFromDataImpl( + std::optional format, AzureObjectStorage * object_storage, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx, - bool distributed_processing) + const ContextPtr & ctx) { RelativePathsWithMetadata read_keys; std::shared_ptr file_iterator; - if (distributed_processing) - { - file_iterator = std::make_shared(ctx, - ctx->getReadTaskCallback()); - } - else if (configuration.withGlobs()) + if (configuration.withGlobs()) { file_iterator = std::make_shared( object_storage, configuration.container, configuration.blob_path, nullptr, NamesAndTypesList{}, ctx, &read_keys); @@ -1448,8 +1499,28 @@ ColumnsDescription StorageAzureBlob::getTableStructureFromData( object_storage, configuration.container, configuration.blobs_paths, nullptr, NamesAndTypesList{}, ctx, &read_keys); } - ReadBufferIterator read_buffer_iterator(file_iterator, object_storage, configuration, format_settings, read_keys, ctx); - return readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); + ReadBufferIterator read_buffer_iterator(file_iterator, object_storage, format, configuration, format_settings, read_keys, ctx); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, ctx), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, ctx); +} + +std::pair StorageAzureBlob::getTableStructureAndFormatFromData( + DB::AzureObjectStorage * object_storage, + const DB::StorageAzureBlob::Configuration & configuration, + const std::optional & format_settings, + const DB::ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, object_storage, configuration, format_settings, ctx); +} + +ColumnsDescription StorageAzureBlob::getTableStructureFromData( + DB::AzureObjectStorage * object_storage, + const DB::StorageAzureBlob::Configuration & configuration, + const std::optional & format_settings, + const DB::ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, object_storage, configuration, format_settings, ctx).first; } SchemaCache & StorageAzureBlob::getSchemaCache(const ContextPtr & ctx) diff --git a/src/Storages/StorageAzureBlob.h b/src/Storages/StorageAzureBlob.h index 16e5b9edfb6..71c93021dd4 100644 --- a/src/Storages/StorageAzureBlob.h +++ b/src/Storages/StorageAzureBlob.h @@ -31,9 +31,9 @@ public: String getPath() const { return blob_path; } - bool update(ContextPtr context); + bool update(const ContextPtr & context); - void connect(ContextPtr context); + void connect(const ContextPtr & context); bool withGlobs() const { return blob_path.find_first_of("*?{") != std::string::npos; } @@ -59,7 +59,7 @@ public: StorageAzureBlob( const Configuration & configuration_, std::unique_ptr && object_storage_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -68,10 +68,10 @@ public: bool 
distributed_processing_, ASTPtr partition_by_); - static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context); + static StorageAzureBlob::Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context); static AzureClientPtr createClient(StorageAzureBlob::Configuration configuration, bool is_read_only); - static AzureObjectStorage::SettingsPtr createSettings(ContextPtr local_context); + static AzureObjectStorage::SettingsPtr createSettings(const ContextPtr & local_context); static void processNamedCollectionResult(StorageAzureBlob::Configuration & configuration, const NamedCollection & collection); @@ -115,10 +115,22 @@ public: AzureObjectStorage * object_storage, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx, - bool distributed_processing = false); + const ContextPtr & ctx); + + static std::pair getTableStructureAndFormatFromData( + AzureObjectStorage * object_storage, + const Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx); private: + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, + AzureObjectStorage * object_storage, + const Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx); + friend class ReadFromAzureBlob; std::string name; @@ -137,7 +149,7 @@ public: class IIterator : public WithContext { public: - IIterator(ContextPtr context_):WithContext(context_) {} + IIterator(const ContextPtr & context_):WithContext(context_) {} virtual ~IIterator() = default; virtual RelativePathWithMetadata next() = 0; @@ -153,7 +165,7 @@ public: String blob_path_with_globs_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs_, std::function file_progress_callback_ = {}); @@ -186,7 +198,7 @@ public: class ReadIterator : public IIterator { public: - explicit ReadIterator(ContextPtr context_, + explicit ReadIterator(const ContextPtr & context_, const ReadTaskCallback & callback_) : IIterator(context_), callback(callback_) { } RelativePathWithMetadata next() override @@ -207,7 +219,7 @@ public: const Strings & keys_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context_, + const ContextPtr & context_, RelativePathsWithMetadata * outer_blobs, std::function file_progress_callback = {}); @@ -229,7 +241,7 @@ public: const ReadFromFormatInfo & info, const String & format_, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, String compression_hint_, diff --git a/src/Storages/StorageAzureBlobCluster.cpp b/src/Storages/StorageAzureBlobCluster.cpp index a6372577fb0..0f607a9812f 100644 --- a/src/Storages/StorageAzureBlobCluster.cpp +++ b/src/Storages/StorageAzureBlobCluster.cpp @@ -36,23 +36,30 @@ StorageAzureBlobCluster::StorageAzureBlobCluster( const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageAzureBlobCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const ContextPtr & context) + : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageAzureBlobCluster (" + table_id_.table_name + ")")) , 
configuration{configuration_} , object_storage(std::move(object_storage_)) { - context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.getConnectionURL()); + context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.getConnectionURL()); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { + ColumnsDescription columns; /// `format_settings` is set to std::nullopt, because StorageAzureBlobCluster is used only as table function - auto columns = StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context_, false); + if (configuration.format == "auto") + std::tie(columns, configuration.format) = StorageAzureBlob::getTableStructureAndFormatFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context); + else + columns = StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context); storage_metadata.setColumns(columns); } else + { + if (configuration.format == "auto") + configuration.format = StorageAzureBlob::getTableStructureAndFormatFromData(object_storage.get(), configuration, /*format_settings=*/std::nullopt, context).second; storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -60,13 +67,14 @@ StorageAzureBlobCluster::StorageAzureBlobCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageAzureBlobCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageAzureBlobCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function s3Cluster, got '{}'", queryToString(query)); - TableFunctionAzureBlobStorageCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionAzureBlobStorageCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), configuration.format, context); } RemoteQueryExecutor::Extension StorageAzureBlobCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const diff --git a/src/Storages/StorageAzureBlobCluster.h b/src/Storages/StorageAzureBlobCluster.h index 2831b94f825..476f21c6742 100644 --- a/src/Storages/StorageAzureBlobCluster.h +++ b/src/Storages/StorageAzureBlobCluster.h @@ -27,8 +27,7 @@ public: const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_); + const ContextPtr & context); std::string getName() const override { return "AzureBlobStorageCluster"; } @@ -43,7 +42,7 @@ public: private: void updateBeforeRead(const ContextPtr & /*context*/) override {} - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; StorageAzureBlob::Configuration configuration; NamesAndTypesList 
virtual_columns; diff --git a/src/Storages/StorageFile.cpp b/src/Storages/StorageFile.cpp index 9f864813de9..920c7069529 100644 --- a/src/Storages/StorageFile.cpp +++ b/src/Storages/StorageFile.cpp @@ -89,6 +89,7 @@ namespace ErrorCodes extern const int LOGICAL_ERROR; extern const int CANNOT_APPEND_TO_FILE; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int CANNOT_COMPILE_REGEXP; } @@ -327,7 +328,7 @@ std::unique_ptr createReadBuffer( } -Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read) +Strings StorageFile::getPathsList(const String & table_path, const String & user_files_path, const ContextPtr & context, size_t & total_bytes_to_read) { fs::path user_files_absolute_path = fs::weakly_canonical(user_files_path); fs::path fs_table_path(table_path); @@ -374,27 +375,44 @@ namespace public: ReadBufferFromFileIterator( const std::vector & paths_, - const String & format_, + std::optional format_, const String & compression_method_, const std::optional & format_settings_, - ContextPtr context_) + const ContextPtr & context_) : WithContext(context_) , paths(paths_) - , format(format_) + , format(std::move(format_)) , compression_method(compression_method_) , format_settings(format_settings_) { } - std::pair, std::optional> next() override + Data next() override { bool is_first = current_index == 0; - /// For default mode check cached columns for all paths on first iteration. - /// If we have cached columns, next() won't be called again. - if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (is_first) { - if (auto cached_columns = tryGetColumnsFromCache(paths)) - return {nullptr, cached_columns}; + /// If format is unknown we iterate through all paths on first iteration and + /// try to determine format by file name. + if (!format) + { + for (const auto & path : paths) + { + if (auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(path)) + { + format = format_from_path; + break; + } + } + } + + /// For default mode check cached columns for all paths on first iteration. + /// If we have cached columns, next() won't be called again. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(paths)) + return {nullptr, cached_columns, format}; + } } String path; @@ -405,11 +423,18 @@ namespace if (current_index == paths.size()) { if (is_first) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. You can specify table structure manually", + *format); + throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually", - format); - return {nullptr, std::nullopt}; + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because all files are empty. 
You can specify the format manually"); + } + return {nullptr, std::nullopt, std::nullopt}; } path = paths[current_index++]; @@ -420,10 +445,10 @@ namespace if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { if (auto cached_columns = tryGetColumnsFromCache({path})) - return {nullptr, cached_columns}; + return {nullptr, cached_columns, format}; } - return {createReadBuffer(path, file_stat, false, -1, compression_method, getContext()), std::nullopt}; + return {createReadBuffer(path, file_stat, false, -1, compression_method, getContext()), std::nullopt, format}; } void setNumRowsToLastFile(size_t num_rows) override @@ -431,7 +456,7 @@ namespace if (!getContext()->getSettingsRef().use_cache_for_count_from_files) return; - auto key = getKeyForSchemaCache(paths[current_index - 1], format, format_settings, getContext()); + auto key = getKeyForSchemaCache(paths[current_index - 1], *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -443,7 +468,7 @@ namespace /// For union mode, schema can be different for different files, so we need to /// cache last inferred schema only for last processed file. - auto cache_key = getKeyForSchemaCache(paths[current_index - 1], format, format_settings, getContext()); + auto cache_key = getKeyForSchemaCache(paths[current_index - 1], *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addColumns(cache_key, columns); } @@ -454,7 +479,7 @@ namespace return; /// For default mode we cache resulting schema for all paths. - auto cache_keys = getKeysForSchemaCache(paths, format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(paths, *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } @@ -465,14 +490,30 @@ namespace return ""; } + void setFormatName(const String & format_name) override + { + format = format_name; + } + + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_index > 0 && current_index <= paths.size()); + auto path = paths[current_index - 1]; + auto file_stat = getFileStat(path, false, -1, "File"); + return createReadBuffer(path, file_stat, false, -1, compression_method, getContext()); + } + private: std::optional tryGetColumnsFromCache(const Strings & paths_) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_file) + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_file) return std::nullopt; /// Check if the cache contains one of the paths. - auto & schema_cache = StorageFile::getSchemaCache(getContext()); + auto & schema_cache = StorageFile::getSchemaCache(context); struct stat file_stat{}; for (const auto & path : paths_) { @@ -484,10 +525,28 @@ namespace return file_stat.st_mtime; }; - auto cache_key = getKeyForSchemaCache(path, format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(path, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. 
+ /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(path, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -496,7 +555,7 @@ namespace const std::vector & paths; size_t current_index = 0; - String format; + std::optional format; String compression_method; const std::optional & format_settings; }; @@ -506,17 +565,17 @@ namespace public: ReadBufferFromArchiveIterator( const StorageFile::ArchiveInfo & archive_info_, - const String & format_, + std::optional format_, const std::optional & format_settings_, - ContextPtr context_) + const ContextPtr & context_) : WithContext(context_) , archive_info(archive_info_) - , format(format_) + , format(std::move(format_)) , format_settings(format_settings_) { } - std::pair, std::optional> next() override + Data next() override { /// For default mode check cached columns for all initial archive paths (maybe with globs) on first iteration. /// If we have cached columns, next() won't be called again. @@ -524,8 +583,8 @@ namespace { for (const auto & archive : archive_info.paths_to_archives) { - if (auto cached_columns = tryGetColumnsFromSchemaCache(archive, archive_info.path_in_archive)) - return {nullptr, cached_columns}; + if (auto cached_schema = tryGetSchemaFromCache(archive, fmt::format("{}::{}", archive, archive_info.path_in_archive))) + return {nullptr, cached_schema, format}; } } @@ -535,12 +594,19 @@ namespace if (current_archive_index == archive_info.paths_to_archives.size()) { if (is_first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. You must specify table structure manually", - format); + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. You can specify table structure manually", + *format); - return {nullptr, std::nullopt}; + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because all files are empty. You can specify the format manually"); + } + + return {nullptr, std::nullopt, format}; } const auto & archive = archive_info.paths_to_archives[current_archive_index]; @@ -554,11 +620,18 @@ namespace continue; } + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because the archive {} is empty. " + "You can specify table structure manually", + *format, + archive); + throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because the archive {} is empty. " - "You must specify table structure manually", - format, + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because the archive {} is empty. 
" + "You can specify the format manually", archive); } @@ -574,8 +647,8 @@ namespace last_read_file_path = paths_for_schema_cache.emplace_back(fmt::format("{}::{}", archive_reader->getPath(), archive_info.path_in_archive)); is_first = false; - if (auto cached_columns = tryGetColumnsFromSchemaCache(archive, last_read_file_path)) - return {nullptr, cached_columns}; + if (auto cached_schema = tryGetSchemaFromCache(archive, last_read_file_path)) + return {nullptr, cached_schema, format}; } else { @@ -611,13 +684,20 @@ namespace last_read_file_path = paths_for_schema_cache.emplace_back(fmt::format("{}::{}", archive_reader->getPath(), *filename)); is_first = false; - if (auto cached_columns = tryGetColumnsFromSchemaCache(archive, last_read_file_path)) + /// If format is unknown we can try to determine it by the file name. + if (!format) + { + if (auto format_from_file = FormatFactory::instance().tryGetFormatFromFileName(*filename)) + format = format_from_file; + } + + if (auto cached_schema = tryGetSchemaFromCache(archive, last_read_file_path)) { /// For union mode next() will be called again even if we found cached columns, /// so we need to remember last_read_buffer to continue iterating through files in archive. if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) last_read_buffer = archive_reader->readFile(std::move(file_enumerator)); - return {nullptr, cached_columns}; + return {nullptr, cached_schema, format}; } read_buf = archive_reader->readFile(std::move(file_enumerator)); @@ -626,7 +706,7 @@ namespace break; } - return {std::move(read_buf), std::nullopt}; + return {std::move(read_buf), std::nullopt, format}; } void setPreviousReadBuffer(std::unique_ptr buffer) override @@ -640,7 +720,7 @@ namespace if (!getContext()->getSettingsRef().use_cache_for_count_from_files) return; - auto key = getKeyForSchemaCache(last_read_file_path, format, format_settings, getContext()); + auto key = getKeyForSchemaCache(last_read_file_path, *format, format_settings, getContext()); StorageFile::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -653,7 +733,7 @@ namespace /// For union mode, schema can be different for different files in archive, so we need to /// cache last inferred schema only for last processed file. 
auto & schema_cache = StorageFile::getSchemaCache(getContext()); - auto cache_key = getKeyForSchemaCache(last_read_file_path, format, format_settings, getContext()); + auto cache_key = getKeyForSchemaCache(last_read_file_path, *format, format_settings, getContext()); schema_cache.addColumns(cache_key, columns); } @@ -669,17 +749,42 @@ namespace for (const auto & archive : archive_info.paths_to_archives) paths_for_schema_cache.emplace_back(fmt::format("{}::{}", archive, archive_info.path_in_archive)); auto & schema_cache = StorageFile::getSchemaCache(getContext()); - auto cache_keys = getKeysForSchemaCache(paths_for_schema_cache, format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(paths_for_schema_cache, *format, format_settings, getContext()); schema_cache.addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { return last_read_file_path; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + if (archive_info.isSingleFileRead()) + { + chassert(current_archive_index > 0 && current_archive_index <= archive_info.paths_to_archives.size()); + const auto & archive = archive_info.paths_to_archives[current_archive_index - 1]; + auto archive_reader = createArchiveReader(archive); + return archive_reader->readFile(archive_info.path_in_archive, false); + } + + chassert(current_archive_index >= 0 && current_archive_index < archive_info.paths_to_archives.size()); + const auto & archive = archive_info.paths_to_archives[current_archive_index]; + auto archive_reader = createArchiveReader(archive); + chassert(last_read_buffer); + file_enumerator = archive_reader->currentFile(std::move(last_read_buffer)); + return archive_reader->readFile(std::move(file_enumerator)); + } + private: - std::optional tryGetColumnsFromSchemaCache(const std::string & archive_path, const std::string & full_path) + std::optional tryGetSchemaFromCache(const std::string & archive_path, const std::string & full_path) { auto context = getContext(); if (!context->getSettingsRef().schema_inference_use_cache_for_file) @@ -695,11 +800,28 @@ namespace return file_stat.st_mtime; }; - auto cache_key = getKeyForSchemaCache(full_path, format, format_settings, context); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(full_path, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(full_path, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. 
+ format = format_name; + return columns; + } + } + } return std::nullopt; } @@ -715,13 +837,13 @@ namespace std::unique_ptr file_enumerator; std::unique_ptr last_read_buffer; - String format; + std::optional format; const std::optional & format_settings; std::vector paths_for_schema_cache; }; } -ColumnsDescription StorageFile::getTableStructureFromFileDescriptor(ContextPtr context) +std::pair StorageFile::getTableStructureAndFormatFromFileDescriptor(std::optional format, const ContextPtr & context) { /// If we want to read schema from file descriptor we should create /// a read buffer from fd, create a checkpoint, read some data required @@ -738,22 +860,29 @@ ColumnsDescription StorageFile::getTableStructureFromFileDescriptor(ContextPtr c read_buf->setCheckpoint(); auto read_buffer_iterator = SingleReadBufferIterator(std::move(read_buf)); - auto columns = readSchemaFromFormat(format_name, format_settings, read_buffer_iterator, false, context, peekable_read_buffer_from_fd); + ColumnsDescription columns; + if (format) + columns = readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context); + else + std::tie(columns, format) = detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); + + peekable_read_buffer_from_fd = read_buffer_iterator.releaseBuffer(); if (peekable_read_buffer_from_fd) { /// If we have created read buffer in readSchemaFromFormat we should rollback to checkpoint. assert_cast(peekable_read_buffer_from_fd.get())->rollbackToCheckpoint(); has_peekable_read_buffer_from_fd = true; } - return columns; + + return {columns, *format}; } -ColumnsDescription StorageFile::getTableStructureFromFile( - const String & format, +std::pair StorageFile::getTableStructureAndFormatFromFileImpl( + std::optional format, const std::vector & paths, const String & compression_method, const std::optional & format_settings, - ContextPtr context, + const ContextPtr & context, const std::optional & archive_info) { if (format == "Distributed") @@ -761,29 +890,60 @@ ColumnsDescription StorageFile::getTableStructureFromFile( if (paths.empty()) throw Exception(ErrorCodes::INCORRECT_FILE_NAME, "Cannot get table structure from file, because no files match specified name"); - return ColumnsDescription(DistributedAsyncInsertSource(paths[0]).getOutputs().front().getHeader().getNamesAndTypesList()); + return {ColumnsDescription(DistributedAsyncInsertSource(paths[0]).getOutputs().front().getHeader().getNamesAndTypesList()), *format}; } if (((archive_info && archive_info->paths_to_archives.empty()) || (!archive_info && paths.empty())) - && !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(format)) + && (!format || !FormatFactory::instance().checkIfFormatHasExternalSchemaReader(*format))) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path. " + "You can specify table structure manually", *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path. " - "You must specify table structure manually", format); + "The data format cannot be detected by the contents of the files, because there are no files with provided path. 
" + "You can specify the format manually"); + + } if (archive_info) { ReadBufferFromArchiveIterator read_buffer_iterator(*archive_info, format, format_settings, context); - return readSchemaFromFormat( - format, - format_settings, - read_buffer_iterator, - /*retry=*/archive_info->paths_to_archives.size() > 1 || !archive_info->isSingleFileRead(), - context); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); } ReadBufferFromFileIterator read_buffer_iterator(paths, format, compression_method, format_settings, context); - return readSchemaFromFormat(format, format_settings, read_buffer_iterator, paths.size() > 1, context); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); +} + +ColumnsDescription StorageFile::getTableStructureFromFile( + const DB::String & format, + const std::vector & paths, + const DB::String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, + const std::optional & archive_info) +{ + return getTableStructureAndFormatFromFileImpl(format, paths, compression_method, format_settings, context, archive_info).first; +} + +std::pair StorageFile::getTableStructureAndFormatFromFile( + const std::vector & paths, + const DB::String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, + const std::optional & archive_info) +{ + return getTableStructureAndFormatFromFileImpl(std::nullopt, paths, compression_method, format_settings, context, archive_info); } bool StorageFile::supportsSubsetOfColumns(const ContextPtr & context) const @@ -874,7 +1034,7 @@ StorageFile::StorageFile(CommonArguments args) , compression_method(args.compression_method) , base_path(args.getContext()->getPath()) { - if (format_name != "Distributed") + if (format_name != "Distributed" && format_name != "auto") FormatFactory::instance().checkFormatName(format_name); } @@ -886,16 +1046,19 @@ void StorageFile::setStorageMetadata(CommonArguments args) { ColumnsDescription columns; if (use_table_fd) - columns = getTableStructureFromFileDescriptor(args.getContext()); + { + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromFileDescriptor(std::nullopt, args.getContext()); + else + columns = getTableStructureAndFormatFromFileDescriptor(format_name, args.getContext()).first; + } else { - columns = getTableStructureFromFile( - format_name, - paths, - compression_method, - format_settings, - args.getContext(), - archive_info); + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromFile(paths, compression_method, format_settings, args.getContext(), archive_info); + else + columns = getTableStructureFromFile(format_name, paths, compression_method, format_settings, args.getContext(), archive_info); + if (!args.columns.empty() && args.columns != columns) throw Exception(ErrorCodes::INCOMPATIBLE_COLUMNS, "Table structure and file structure are different"); } @@ -903,6 +1066,8 @@ void StorageFile::setStorageMetadata(CommonArguments args) } else { + if (format_name == "auto") + format_name = getTableStructureAndFormatFromFile(paths, compression_method, format_settings, args.getContext(), archive_info).second; /// We don't allow special columns in File storage. 
if (!args.columns.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine File doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -917,7 +1082,7 @@ void StorageFile::setStorageMetadata(CommonArguments args) } -static std::chrono::seconds getLockTimeout(ContextPtr context) +static std::chrono::seconds getLockTimeout(const ContextPtr & context) { const Settings & settings = context->getSettingsRef(); Int64 lock_timeout = settings.lock_acquire_timeout.totalSeconds(); @@ -933,9 +1098,9 @@ StorageFileSource::FilesIterator::FilesIterator( std::optional archive_info_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - ContextPtr context_, + const ContextPtr & context_, bool distributed_processing_) - : files(files_), archive_info(std::move(archive_info_)), distributed_processing(distributed_processing_), context(context_) + : WithContext(context_), files(files_), archive_info(std::move(archive_info_)), distributed_processing(distributed_processing_) { ActionsDAGPtr filter_dag; if (!distributed_processing && !archive_info && !files.empty()) @@ -948,7 +1113,7 @@ StorageFileSource::FilesIterator::FilesIterator( String StorageFileSource::FilesIterator::next() { if (distributed_processing) - return context->getReadTaskCallback()(); + return getContext()->getReadTaskCallback()(); else { const auto & fs = isReadFromArchive() ? archive_info->paths_to_archives : files; @@ -972,12 +1137,12 @@ const String & StorageFileSource::FilesIterator::getFileNameInArchive() StorageFileSource::StorageFileSource( const ReadFromFormatInfo & info, std::shared_ptr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, FilesIteratorPtr files_iterator_, std::unique_ptr read_buf_, bool need_only_count_) - : SourceWithKeyCondition(info.source_header, false) + : SourceWithKeyCondition(info.source_header, false), WithContext(context_) , storage(std::move(storage_)) , files_iterator(std::move(files_iterator_)) , read_buf(std::move(read_buf_)) @@ -985,13 +1150,12 @@ StorageFileSource::StorageFileSource( , requested_columns(info.requested_columns) , requested_virtual_columns(info.requested_virtual_columns) , block_for_format(info.format_header) - , context(context_) , max_block_size(max_block_size_) , need_only_count(need_only_count_) { if (!storage->use_table_fd) { - shared_lock = std::shared_lock(storage->rwlock, getLockTimeout(context)); + shared_lock = std::shared_lock(storage->rwlock, getLockTimeout(getContext())); if (!shared_lock) throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Lock timeout exceeded"); storage->readers_counter.fetch_add(1, std::memory_order_release); @@ -1008,7 +1172,7 @@ void StorageFileSource::beforeDestroy() if (std::uncaught_exceptions() == 0 && cnt == 1 && !storage->was_renamed) { shared_lock.unlock(); - auto exclusive_lock = std::unique_lock{storage->rwlock, getLockTimeout(context)}; + auto exclusive_lock = std::unique_lock{storage->rwlock, getLockTimeout(getContext())}; if (!exclusive_lock) return; @@ -1027,7 +1191,7 @@ void StorageFileSource::beforeDestroy() file_path = file_path.lexically_normal(); // Checking access rights - checkCreationIsAllowed(context, context->getUserFilesPath(), file_path, true); + checkCreationIsAllowed(getContext(), getContext()->getUserFilesPath(), file_path, true); // Checking an existing of new file if (fs::exists(file_path)) @@ -1060,7 +1224,7 @@ void StorageFileSource::setKeyCondition(const ActionsDAG::NodeRawConstPtrs & nod bool 
StorageFileSource::tryGetCountFromCache(const struct stat & file_stat) { - if (!context->getSettingsRef().use_cache_for_count_from_files) + if (!getContext()->getSettingsRef().use_cache_for_count_from_files) return false; auto num_rows_from_cache = tryGetNumRowsFromCache(current_path, file_stat.st_mtime); @@ -1102,7 +1266,7 @@ Chunk StorageFileSource::generate() return {}; auto file_stat = getFileStat(archive, storage->use_table_fd, storage->table_fd, storage->getName()); - if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + if (getContext()->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) continue; archive_reader = createArchiveReader(archive); @@ -1116,7 +1280,7 @@ Chunk StorageFileSource::generate() if (!read_buf) continue; - if (auto progress_callback = context->getFileProgressCallback()) + if (auto progress_callback = getContext()->getFileProgressCallback()) progress_callback(FileProgress(0, tryGetFileSizeFromReadBuffer(*read_buf).value_or(0))); } else @@ -1130,7 +1294,7 @@ Chunk StorageFileSource::generate() return {}; current_archive_stat = getFileStat(archive, storage->use_table_fd, storage->table_fd, storage->getName()); - if (context->getSettingsRef().engine_file_skip_empty_files && current_archive_stat.st_size == 0) + if (getContext()->getSettingsRef().engine_file_skip_empty_files && current_archive_stat.st_size == 0) continue; archive_reader = createArchiveReader(archive); @@ -1164,7 +1328,7 @@ Chunk StorageFileSource::generate() continue; read_buf = archive_reader->readFile(std::move(file_enumerator)); - if (auto progress_callback = context->getFileProgressCallback()) + if (auto progress_callback = getContext()->getFileProgressCallback()) progress_callback(FileProgress(0, tryGetFileSizeFromReadBuffer(*read_buf).value_or(0))); } } @@ -1190,16 +1354,16 @@ Chunk StorageFileSource::generate() file_stat = getFileStat(current_path, storage->use_table_fd, storage->table_fd, storage->getName()); current_file_size = file_stat.st_size; - if (context->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) + if (getContext()->getSettingsRef().engine_file_skip_empty_files && file_stat.st_size == 0) continue; if (need_only_count && tryGetCountFromCache(file_stat)) continue; - read_buf = createReadBuffer(current_path, file_stat, storage->use_table_fd, storage->table_fd, storage->compression_method, context); + read_buf = createReadBuffer(current_path, file_stat, storage->use_table_fd, storage->table_fd, storage->compression_method, getContext()); } - const Settings & settings = context->getSettingsRef(); + const Settings & settings = getContext()->getSettingsRef(); size_t file_num = 0; if (storage->archive_info) @@ -1211,7 +1375,7 @@ Chunk StorageFileSource::generate() const auto max_parsing_threads = std::max(settings.max_threads / file_num, 1UL); input_format = FormatFactory::instance().getInput( - storage->format_name, *read_buf, block_for_format, context, max_block_size, storage->format_settings, + storage->format_name, *read_buf, block_for_format, getContext(), max_block_size, storage->format_settings, max_parsing_threads, std::nullopt, /*is_remote_fs*/ false, CompressionMethod::None, need_only_count); if (key_condition) @@ -1227,7 +1391,7 @@ Chunk StorageFileSource::generate() { builder.addSimpleTransform([&](const Block & header) { - return std::make_shared(header, columns_description, *input_format, context); + return std::make_shared(header, columns_description, *input_format, getContext()); }); } @@ 
-1264,7 +1428,7 @@ Chunk StorageFileSource::generate() if (storage->use_table_fd) finished_generate = true; - if (input_format && storage->format_name != "Distributed" && context->getSettingsRef().use_cache_for_count_from_files) + if (input_format && storage->format_name != "Distributed" && getContext()->getSettingsRef().use_cache_for_count_from_files) addNumRowsToCache(current_path, total_rows_in_file); total_rows_in_file = 0; @@ -1295,14 +1459,14 @@ Chunk StorageFileSource::generate() void StorageFileSource::addNumRowsToCache(const String & path, size_t num_rows) const { - auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, context); - StorageFile::getSchemaCache(context).addNumRows(key, num_rows); + auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, getContext()); + StorageFile::getSchemaCache(getContext()).addNumRows(key, num_rows); } std::optional StorageFileSource::tryGetNumRowsFromCache(const String & path, time_t last_mod_time) const { - auto & schema_cache = StorageFile::getSchemaCache(context); - auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, context); + auto & schema_cache = StorageFile::getSchemaCache(getContext()); + auto key = getKeyForSchemaCache(path, storage->format_name, storage->format_settings, getContext()); auto get_last_mod_time = [&]() -> std::optional { return last_mod_time; @@ -1311,7 +1475,7 @@ std::optional StorageFileSource::tryGetNumRowsFromCache(const String & p return schema_cache.tryGetNumRows(key, get_last_mod_time); } -class ReadFromFile : public SourceStepWithFilter +class ReadFromFile : public SourceStepWithFilter, WithContext { public: std::string getName() const override { return "ReadFromFile"; } @@ -1323,14 +1487,13 @@ public: std::shared_ptr storage_, ReadFromFormatInfo info_, const bool need_only_count_, - ContextPtr context_, + const ContextPtr & context_, size_t max_block_size_, size_t num_streams_) - : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}) + : SourceStepWithFilter(DataStream{.header = std::move(sample_block)}), WithContext(context_) , storage(std::move(storage_)) , info(std::move(info_)) , need_only_count(need_only_count_) - , context(std::move(context_)) , max_block_size(max_block_size_) , max_num_streams(num_streams_) { @@ -1341,7 +1504,6 @@ private: ReadFromFormatInfo info; const bool need_only_count; - ContextPtr context; size_t max_block_size; const size_t max_num_streams; @@ -1352,7 +1514,7 @@ private: void ReadFromFile::applyFilters() { - auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes, {}, context); + auto filter_actions_dag = ActionsDAG::buildFilterActionsDAG(filter_nodes.nodes, {}, getContext()); const ActionsDAG::Node * predicate = nullptr; if (filter_actions_dag) predicate = filter_actions_dag->getOutputs().at(0); @@ -1422,7 +1584,7 @@ void ReadFromFile::createIterator(const ActionsDAG::Node * predicate) storage->archive_info, predicate, storage->virtual_columns, - context, + getContext(), storage->distributed_processing); } @@ -1444,8 +1606,10 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui Pipes pipes; pipes.reserve(num_streams); + auto ctx = getContext(); + /// Set total number of bytes to process. For progress bar. 
- auto progress_callback = context->getFileProgressCallback(); + auto progress_callback = ctx->getFileProgressCallback(); if (progress_callback && !storage->archive_info) progress_callback(FileProgress(0, storage->total_bytes_to_read)); @@ -1463,20 +1627,20 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui auto source = std::make_shared( info, storage, - context, + ctx, max_block_size, files_iterator, std::move(read_buffer), need_only_count); - source->setKeyCondition(filter_nodes.nodes, context); + source->setKeyCondition(filter_nodes.nodes, ctx); pipes.emplace_back(std::move(source)); } auto pipe = Pipe::unitePipes(std::move(pipes)); size_t output_ports = pipe.numOutputPorts(); - const bool parallelize_output = context->getSettingsRef().parallelize_output_from_storages; - if (parallelize_output && storage->parallelizeOutputAfterReading(context) && output_ports > 0 && output_ports < max_num_streams) + const bool parallelize_output = ctx->getSettingsRef().parallelize_output_from_storages; + if (parallelize_output && storage->parallelizeOutputAfterReading(ctx) && output_ports > 0 && output_ports < max_num_streams) pipe.resize(max_num_streams); if (pipe.empty()) @@ -1489,7 +1653,7 @@ void ReadFromFile::initializePipeline(QueryPipelineBuilder & pipeline, const Bui } -class StorageFileSink final : public SinkToStorage +class StorageFileSink final : public SinkToStorage, WithContext { public: StorageFileSink( @@ -1502,9 +1666,9 @@ public: const CompressionMethod compression_method_, const std::optional & format_settings_, const String format_name_, - ContextPtr context_, + const ContextPtr & context_, int flags_) - : SinkToStorage(metadata_snapshot_->getSampleBlock()) + : SinkToStorage(metadata_snapshot_->getSampleBlock()), WithContext(context_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) , table_fd(table_fd_) @@ -1514,7 +1678,6 @@ public: , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) - , context(context_) , flags(flags_) { initialize(); @@ -1531,9 +1694,9 @@ public: const CompressionMethod compression_method_, const std::optional & format_settings_, const String format_name_, - ContextPtr context_, + const ContextPtr & context_, int flags_) - : SinkToStorage(metadata_snapshot_->getSampleBlock()) + : SinkToStorage(metadata_snapshot_->getSampleBlock()), WithContext(context_) , metadata_snapshot(metadata_snapshot_) , table_name_for_log(table_name_for_log_) , table_fd(table_fd_) @@ -1543,7 +1706,6 @@ public: , compression_method(compression_method_) , format_name(format_name_) , format_settings(format_settings_) - , context(context_) , flags(flags_) , lock(std::move(lock_)) { @@ -1567,7 +1729,7 @@ public: /// In case of formats with prefixes if file is not empty we have already written prefix. 
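/// ("Prefix" here means whatever a format emits before the first row, e.g. the column-names header of CSVWithNames or the opening of the JSON output format; when appending to a non-empty file it must not be written again, which is what the size() check below and writer->doNotWritePrefix() ensure.)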
bool do_not_write_prefix = naked_buffer->size(); - const auto & settings = context->getSettingsRef(); + const auto & settings = getContext()->getSettingsRef(); write_buf = wrapWriteBufferWithCompressionMethod( std::move(naked_buffer), compression_method, @@ -1575,7 +1737,7 @@ public: static_cast(settings.output_format_compression_zstd_window_log)); writer = FormatFactory::instance().getOutputFormatParallelIfPossible(format_name, - *write_buf, metadata_snapshot->getSampleBlock(), context, format_settings); + *write_buf, metadata_snapshot->getSampleBlock(), getContext(), format_settings); if (do_not_write_prefix) writer->doNotWritePrefix(); @@ -1658,7 +1820,6 @@ private: std::string format_name; std::optional format_settings; - ContextPtr context; int flags; std::unique_lock lock; @@ -2043,7 +2204,7 @@ StorageFile::ArchiveInfo StorageFile::getArchiveInfo( const std::string & path_to_archive, const std::string & file_in_archive, const std::string & user_files_path, - ContextPtr context, + const ContextPtr & context, size_t & total_bytes_to_read ) { diff --git a/src/Storages/StorageFile.h b/src/Storages/StorageFile.h index b74868597a6..a5ccbc8f506 100644 --- a/src/Storages/StorageFile.h +++ b/src/Storages/StorageFile.h @@ -84,7 +84,7 @@ public: static Names getVirtualColumnNames(); - static Strings getPathsList(const String & table_path, const String & user_files_path, ContextPtr context, size_t & total_bytes_to_read); + static Strings getPathsList(const String & table_path, const String & user_files_path, const ContextPtr & context, size_t & total_bytes_to_read); /// Check if the format supports reading only some subset of columns. /// Is is useful because such formats could effectively skip unknown columns @@ -112,14 +112,19 @@ public: } }; - ColumnsDescription getTableStructureFromFileDescriptor(ContextPtr context); - static ColumnsDescription getTableStructureFromFile( const String & format, const std::vector & paths, const String & compression_method, const std::optional & format_settings, - ContextPtr context, + const ContextPtr & context, + const std::optional & archive_info = std::nullopt); + + static std::pair getTableStructureAndFormatFromFile( + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, const std::optional & archive_info = std::nullopt); static SchemaCache & getSchemaCache(const ContextPtr & context); @@ -130,7 +135,7 @@ public: const std::string & path_to_archive, const std::string & file_in_archive, const std::string & user_files_path, - ContextPtr context, + const ContextPtr & context, size_t & total_bytes_to_read); bool supportsTrivialCountOptimization() const override { return true; } @@ -141,6 +146,16 @@ protected: friend class ReadFromFile; private: + std::pair getTableStructureAndFormatFromFileDescriptor(std::optional format, const ContextPtr & context); + + static std::pair getTableStructureAndFormatFromFileImpl( + std::optional format, + const std::vector & paths, + const String & compression_method, + const std::optional & format_settings, + const ContextPtr & context, + const std::optional & archive_info = std::nullopt); + void setStorageMetadata(CommonArguments args); std::string format_name; @@ -187,10 +202,10 @@ private: bool distributed_processing = false; }; -class StorageFileSource : public SourceWithKeyCondition +class StorageFileSource : public SourceWithKeyCondition, WithContext { public: - class FilesIterator + class FilesIterator : WithContext { public: explicit 
FilesIterator( @@ -198,7 +213,7 @@ public: std::optional archive_info_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - ContextPtr context_, + const ContextPtr & context_, bool distributed_processing_ = false); String next(); @@ -227,8 +242,6 @@ private: std::atomic index = 0; bool distributed_processing; - - ContextPtr context; }; using FilesIteratorPtr = std::shared_ptr; @@ -236,7 +249,7 @@ private: StorageFileSource( const ReadFromFormatInfo & info, std::shared_ptr storage_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size_, FilesIteratorPtr files_iterator_, std::unique_ptr read_buf_, @@ -286,7 +299,6 @@ private: NamesAndTypesList requested_virtual_columns; Block block_for_format; - ContextPtr context; /// TODO Untangle potential issues with context lifetime. UInt64 max_block_size; bool finished_generate = false; diff --git a/src/Storages/StorageFileCluster.cpp b/src/Storages/StorageFileCluster.cpp index c12124f1e07..65eec0a7ea1 100644 --- a/src/Storages/StorageFileCluster.cpp +++ b/src/Storages/StorageFileCluster.cpp @@ -25,36 +25,39 @@ extern const int LOGICAL_ERROR; } StorageFileCluster::StorageFileCluster( - ContextPtr context_, + const ContextPtr & context, const String & cluster_name_, const String & filename_, const String & format_name_, - const String & compression_method_, + const String & compression_method, const StorageID & table_id_, const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageFileCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const ConstraintsDescription & constraints_) + : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageFileCluster (" + table_id_.table_name + ")")) , filename(filename_) , format_name(format_name_) - , compression_method(compression_method_) { StorageInMemoryMetadata storage_metadata; size_t total_bytes_to_read; // its value isn't used as we are not reading files (just listing them). 
But it is required by getPathsList - paths = StorageFile::getPathsList(filename_, context_->getUserFilesPath(), context_, total_bytes_to_read); + paths = StorageFile::getPathsList(filename_, context->getUserFilesPath(), context, total_bytes_to_read); if (columns_.empty()) { - auto columns = StorageFile::getTableStructureFromFile(format_name, - paths, - compression_method, - std::nullopt, - context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context); + else + columns = StorageFile::getTableStructureFromFile(format_name, paths, compression_method, std::nullopt, context); + storage_metadata.setColumns(columns); } else + { + if (format_name == "auto") + format_name = StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context).second; storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -62,13 +65,14 @@ StorageFileCluster::StorageFileCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageFileCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageFileCluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function fileCluster, got '{}'", queryToString(query)); - TableFunctionFileCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionFileCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), format_name, context); } RemoteQueryExecutor::Extension StorageFileCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const diff --git a/src/Storages/StorageFileCluster.h b/src/Storages/StorageFileCluster.h index a6e57c3bb4f..2803c8b6e5b 100644 --- a/src/Storages/StorageFileCluster.h +++ b/src/Storages/StorageFileCluster.h @@ -17,15 +17,14 @@ class StorageFileCluster : public IStorageCluster { public: StorageFileCluster( - ContextPtr context_, + const ContextPtr & context_, const String & cluster_name_, const String & filename_, const String & format_name_, const String & compression_method_, const StorageID & table_id_, const ColumnsDescription & columns_, - const ConstraintsDescription & constraints_, - bool structure_argument_was_provided_); + const ConstraintsDescription & constraints_); std::string getName() const override { return "FileCluster"; } @@ -38,12 +37,11 @@ public: bool supportsTrivialCountOptimization() const override { return true; } private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; Strings paths; String filename; String format_name; - String compression_method; NamesAndTypesList virtual_columns; }; diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index c376af5a3d7..8e5b6040a63 100644 --- 
a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -129,6 +129,7 @@ namespace ErrorCodes extern const int UNEXPECTED_EXPRESSION; extern const int DATABASE_ACCESS_DENIED; extern const int CANNOT_EXTRACT_TABLE_STRUCTURE; + extern const int CANNOT_DETECT_FORMAT; extern const int NOT_IMPLEMENTED; extern const int CANNOT_COMPILE_REGEXP; extern const int FILE_DOESNT_EXIST; @@ -428,7 +429,7 @@ StorageS3Source::DisclosedGlobIterator::DisclosedGlobIterator( const S3::URI & globbed_uri_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns_, - ContextPtr context, + const ContextPtr & context, KeysWithInfo * read_keys_, const S3Settings::RequestSettings & request_settings_, std::function file_progress_callback_) @@ -563,7 +564,7 @@ StorageS3Source::StorageS3Source( const ReadFromFormatInfo & info, const String & format_, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, const S3Settings::RequestSettings & request_settings_, @@ -841,7 +842,7 @@ public: StorageS3Sink( const String & format, const Block & sample_block_, - ContextPtr context, + const ContextPtr & context, std::optional format_settings_, const CompressionMethod compression_method, const StorageS3::Configuration & configuration_, @@ -949,23 +950,22 @@ private: }; -class PartitionedStorageS3Sink : public PartitionedSink +class PartitionedStorageS3Sink : public PartitionedSink, WithContext { public: PartitionedStorageS3Sink( const ASTPtr & partition_by, const String & format_, const Block & sample_block_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, const CompressionMethod compression_method_, const StorageS3::Configuration & configuration_, const String & bucket_, const String & key_) - : PartitionedSink(partition_by, context_, sample_block_) + : PartitionedSink(partition_by, context_, sample_block_), WithContext(context_) , format(format_) , sample_block(sample_block_) - , context(context_) , compression_method(compression_method_) , configuration(configuration_) , bucket(bucket_) @@ -985,7 +985,7 @@ public: return std::make_shared( format, sample_block, - context, + getContext(), format_settings, compression_method, configuration, @@ -997,7 +997,6 @@ public: private: const String format; const Block sample_block; - const ContextPtr context; const CompressionMethod compression_method; const StorageS3::Configuration configuration; const String bucket; @@ -1033,7 +1032,7 @@ private: StorageS3::StorageS3( const Configuration & configuration_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -1050,18 +1049,27 @@ StorageS3::StorageS3( { updateConfiguration(context_); // NOLINT(clang-analyzer-optin.cplusplus.VirtualCall) - FormatFactory::instance().checkFormatName(configuration.format); + if (configuration.format != "auto") + FormatFactory::instance().checkFormatName(configuration.format); context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration.url.uri); context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration.headers_from_ast); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromDataImpl(configuration, format_settings, context_); + ColumnsDescription columns; + if (configuration.format == "auto") + std::tie(columns, configuration.format) = 
getTableStructureAndFormatFromData(configuration, format_settings, context_); + else + columns = getTableStructureFromData(configuration, format_settings, context_); + storage_metadata.setColumns(columns); } else { + if (configuration.format == "auto") + configuration.format = getTableStructureAndFormatFromData(configuration, format_settings, context_).second; + /// We don't allow special columns in S3 storage. if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine S3 doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -1350,14 +1358,14 @@ void StorageS3::truncate(const ASTPtr & /* query */, const StorageMetadataPtr &, LOG_WARNING(&Poco::Logger::get("StorageS3"), "Failed to delete {}, error: {}", error.GetKey(), error.GetMessage()); } -StorageS3::Configuration StorageS3::updateConfigurationAndGetCopy(ContextPtr local_context) +StorageS3::Configuration StorageS3::updateConfigurationAndGetCopy(const ContextPtr & local_context) { std::lock_guard lock(configuration_update_mutex); configuration.update(local_context); return configuration; } -void StorageS3::updateConfiguration(ContextPtr local_context) +void StorageS3::updateConfiguration(const ContextPtr & local_context) { std::lock_guard lock(configuration_update_mutex); configuration.update(local_context); @@ -1375,7 +1383,7 @@ const StorageS3::Configuration & StorageS3::getConfiguration() return configuration; } -bool StorageS3::Configuration::update(ContextPtr context) +bool StorageS3::Configuration::update(const ContextPtr & context) { auto s3_settings = context->getStorageS3Settings().getSettings(url.uri.toString()); request_settings = s3_settings.request_settings; @@ -1390,7 +1398,7 @@ bool StorageS3::Configuration::update(ContextPtr context) return true; } -void StorageS3::Configuration::connect(ContextPtr context) +void StorageS3::Configuration::connect(const ContextPtr & context) { const Settings & global_settings = context->getGlobalContext()->getSettingsRef(); const Settings & local_settings = context->getSettingsRef(); @@ -1462,7 +1470,7 @@ void StorageS3::processNamedCollectionResult(StorageS3::Configuration & configur configuration.request_settings = S3Settings::RequestSettings(collection); } -StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file) +StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file) { StorageS3::Configuration configuration; @@ -1601,7 +1609,7 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, Context configuration.keys = {configuration.url.key}; if (configuration.format == "auto" && get_format_from_file) - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.url.key, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.url.key).value_or("auto"); return configuration; } @@ -1609,9 +1617,17 @@ StorageS3::Configuration StorageS3::getConfiguration(ASTs & engine_args, Context ColumnsDescription StorageS3::getTableStructureFromData( const StorageS3::Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx) + const ContextPtr & ctx) { - return getTableStructureFromDataImpl(configuration, format_settings, ctx); + return getTableStructureAndFormatFromDataImpl(configuration.format, configuration, format_settings, ctx).first; +} + +std::pair 
StorageS3::getTableStructureAndFormatFromData( + const StorageS3::Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, configuration, format_settings, ctx); } namespace @@ -1623,24 +1639,43 @@ namespace std::shared_ptr file_iterator_, const StorageS3Source::KeysWithInfo & read_keys_, const StorageS3::Configuration & configuration_, + std::optional format_, const std::optional & format_settings_, const ContextPtr & context_) : WithContext(context_) , file_iterator(file_iterator_) , read_keys(read_keys_) , configuration(configuration_) + , format(std::move(format_)) , format_settings(format_settings_) , prev_read_keys_size(read_keys_.size()) { } - std::pair, std::optional> next() override + Data next() override { - /// For default mode check cached columns for currently read keys on first iteration. - if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (first) { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns}; + /// If format is unknown we iterate through all currently read keys on first iteration and + /// try to determine format by file name. + if (!format) + { + for (const auto & key_with_info : read_keys) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(key_with_info->key)) + { + format = format_from_file_name; + break; + } + } + } + + /// For default mode check cached columns for currently read keys on first iteration. + if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + return {nullptr, cached_columns, format}; + } } while (true) @@ -1650,13 +1685,34 @@ namespace if (!current_key_with_info || current_key_with_info->key.empty()) { if (first) - throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because there are no files with provided path " - "in S3 or all files are empty. You must specify table structure manually", - configuration.format); + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because there are no files with provided path " + "in S3 or all files are empty. You can specify table structure manually", + *format); - return {nullptr, std::nullopt}; + throw Exception( + ErrorCodes::CANNOT_DETECT_FORMAT, + "The data format cannot be detected by the contents of the files, because there are no files with provided path " + "in S3 or all files are empty. You can specify the format manually"); + } + + return {nullptr, std::nullopt, format}; + } + + /// S3 file iterator could get new keys after new iteration, if format is unknown we can try to determine it by new file names. + if (!format && read_keys.size() > prev_read_keys_size) + { + for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->key)) + { + format = format_from_file_name; + break; + } + } } /// S3 file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default. 
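/// (Globbed S3 listings are expanded lazily, so each call to the file iterator may append newly discovered keys to read_keys; only the tail that appeared since the previous call - tracked by prev_read_keys_size - is probed against the schema cache and checked for a format-revealing file name.)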
@@ -1665,9 +1721,11 @@ namespace auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); prev_read_keys_size = read_keys.size(); if (columns_from_cache) - return {nullptr, columns_from_cache}; + return {nullptr, columns_from_cache, format}; } + prev_read_keys_size = read_keys.size(); + if (getContext()->getSettingsRef().s3_skip_empty_files && current_key_with_info->info && current_key_with_info->info->size == 0) continue; @@ -1678,7 +1736,7 @@ namespace if (auto columns_from_cache = tryGetColumnsFromCache(keys.begin(), keys.end())) { first = false; - return {nullptr, columns_from_cache}; + return {nullptr, columns_from_cache, format}; } } @@ -1687,7 +1745,7 @@ namespace if (!getContext()->getSettingsRef().s3_skip_empty_files || !impl->eof()) { first = false; - return {wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max), std::nullopt}; + return {wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max), std::nullopt, format}; } } } @@ -1698,7 +1756,7 @@ namespace return; String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info->key; - auto key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageS3::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -1709,7 +1767,7 @@ namespace return; String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / configuration.url.bucket / current_key_with_info->key; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); + auto cache_key = getKeyForSchemaCache(source, *format, format_settings, getContext()); StorageS3::getSchemaCache(getContext()).addColumns(cache_key, columns); } @@ -1723,10 +1781,15 @@ namespace Strings sources; sources.reserve(read_keys.size()); std::transform(read_keys.begin(), read_keys.end(), std::back_inserter(sources), [&](const auto & elem){ return host_and_bucket / elem->key; }); - auto cache_keys = getKeysForSchemaCache(sources, configuration.format, format_settings, getContext()); + auto cache_keys = getKeysForSchemaCache(sources, *format, format_settings, getContext()); StorageS3::getSchemaCache(getContext()).addManyColumns(cache_keys, columns); } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { if (current_key_with_info) @@ -1734,15 +1797,26 @@ namespace return ""; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_key_with_info); + int zstd_window_log_max = static_cast(getContext()->getSettingsRef().zstd_window_log_max); + auto impl = std::make_unique(configuration.client, configuration.url.bucket, current_key_with_info->key, configuration.url.version_id, configuration.request_settings, getContext()->getReadSettings()); + return wrapReadBufferWithCompressionMethod(std::move(impl), chooseCompressionMethod(current_key_with_info->key, configuration.compression_method), zstd_window_log_max); + } + private: std::optional tryGetColumnsFromCache( const 
StorageS3::KeysWithInfo::const_iterator & begin, const StorageS3::KeysWithInfo::const_iterator & end) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_s3) + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_s3) return std::nullopt; - auto & schema_cache = StorageS3::getSchemaCache(getContext()); + auto & schema_cache = StorageS3::getSchemaCache(context); for (auto it = begin; it < end; ++it) { auto get_last_mod_time = [&] @@ -1773,10 +1847,29 @@ namespace String path = fs::path(configuration.url.bucket) / (*it)->key; String source = fs::path(configuration.url.uri.getHost() + std::to_string(configuration.url.uri.getPort())) / path; - auto cache_key = getKeyForSchemaCache(source, configuration.format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + + if (format) + { + auto cache_key = getKeyForSchemaCache(source, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. + for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(source, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + /// Now format is known. It should be the same for all files. + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -1785,6 +1878,7 @@ namespace std::shared_ptr file_iterator; const StorageS3Source::KeysWithInfo & read_keys; const StorageS3::Configuration & configuration; + std::optional format; const std::optional & format_settings; StorageS3Source::KeyWithInfoPtr current_key_with_info; size_t prev_read_keys_size; @@ -1793,17 +1887,20 @@ namespace } -ColumnsDescription StorageS3::getTableStructureFromDataImpl( +std::pair StorageS3::getTableStructureAndFormatFromDataImpl( + std::optional format, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx) + const ContextPtr & ctx) { KeysWithInfo read_keys; auto file_iterator = createFileIterator(configuration, false, ctx, {}, {}, &read_keys); - ReadBufferIterator read_buffer_iterator(file_iterator, read_keys, configuration, format_settings, ctx); - return readSchemaFromFormat(configuration.format, format_settings, read_buffer_iterator, configuration.withGlobs(), ctx); + ReadBufferIterator read_buffer_iterator(file_iterator, read_keys, configuration, format, format_settings, ctx); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, ctx), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, ctx); } void registerStorageS3Impl(const String & name, StorageFactory & factory) diff --git a/src/Storages/StorageS3.h b/src/Storages/StorageS3.h index b90a0d394cb..cb3c3f4b947 100644 --- a/src/Storages/StorageS3.h +++ b/src/Storages/StorageS3.h @@ -80,7 +80,7 @@ public: const S3::URI & globbed_uri_, const ActionsDAG::Node * predicate, const NamesAndTypesList & virtual_columns, - ContextPtr context, + const ContextPtr & context, KeysWithInfo * read_keys_ = nullptr, const S3Settings::RequestSettings & request_settings_ = 
{}, std::function progress_callback_ = {}); @@ -134,7 +134,7 @@ public: const ReadFromFormatInfo & info, const String & format, String name_, - ContextPtr context_, + const ContextPtr & context_, std::optional format_settings_, UInt64 max_block_size_, const S3Settings::RequestSettings & request_settings_, @@ -280,9 +280,9 @@ public: String getPath() const { return url.key; } - bool update(ContextPtr context); + bool update(const ContextPtr & context); - void connect(ContextPtr context); + void connect(const ContextPtr & context); bool withGlobs() const { return url.key.find_first_of("*?{") != std::string::npos; } @@ -308,7 +308,7 @@ public: StorageS3( const Configuration & configuration_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, @@ -345,21 +345,26 @@ public: static SchemaCache & getSchemaCache(const ContextPtr & ctx); - static StorageS3::Configuration getConfiguration(ASTs & engine_args, ContextPtr local_context, bool get_format_from_file = true); + static StorageS3::Configuration getConfiguration(ASTs & engine_args, const ContextPtr & local_context, bool get_format_from_file = true); static ColumnsDescription getTableStructureFromData( const StorageS3::Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx); + const ContextPtr & ctx); + + static std::pair getTableStructureAndFormatFromData( + const StorageS3::Configuration & configuration, + const std::optional & format_settings, + const ContextPtr & ctx); using KeysWithInfo = StorageS3Source::KeysWithInfo; bool supportsTrivialCountOptimization() const override { return true; } protected: - virtual Configuration updateConfigurationAndGetCopy(ContextPtr local_context); + virtual Configuration updateConfigurationAndGetCopy(const ContextPtr & local_context); - virtual void updateConfiguration(ContextPtr local_context); + virtual void updateConfiguration(const ContextPtr & local_context); void useConfiguration(const Configuration & new_configuration); @@ -380,10 +385,11 @@ private: std::optional format_settings; ASTPtr partition_by; - static ColumnsDescription getTableStructureFromDataImpl( + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, const Configuration & configuration, const std::optional & format_settings, - ContextPtr ctx); + const ContextPtr & ctx); bool supportsSubcolumns() const override { return true; } diff --git a/src/Storages/StorageS3Cluster.cpp b/src/Storages/StorageS3Cluster.cpp index e1738056e9d..5264372889e 100644 --- a/src/Storages/StorageS3Cluster.cpp +++ b/src/Storages/StorageS3Cluster.cpp @@ -38,25 +38,34 @@ StorageS3Cluster::StorageS3Cluster( const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageS3Cluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + const ContextPtr & context) + : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageS3Cluster (" + table_id_.table_name + ")")) , s3_configuration{configuration_} { - context_->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.url.uri); - context_->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration_.headers_from_ast); + context->getGlobalContext()->getRemoteHostFilter().checkURL(configuration_.url.uri); + 
context->getGlobalContext()->getHTTPHeaderFilter().checkHeaders(configuration_.headers_from_ast); StorageInMemoryMetadata storage_metadata; - updateConfigurationIfChanged(context_); + updateConfigurationIfChanged(context); if (columns_.empty()) { + ColumnsDescription columns; /// `format_settings` is set to std::nullopt, because StorageS3Cluster is used only as table function - auto columns = StorageS3::getTableStructureFromDataImpl(s3_configuration, /*format_settings=*/std::nullopt, context_); + if (s3_configuration.format == "auto") + std::tie(columns, s3_configuration.format) = StorageS3::getTableStructureAndFormatFromData(s3_configuration, /*format_settings=*/std::nullopt, context); + else + columns = StorageS3::getTableStructureFromData(s3_configuration, /*format_settings=*/std::nullopt, context); + storage_metadata.setColumns(columns); } else + { + if (s3_configuration.format == "auto") + s3_configuration.format = StorageS3::getTableStructureAndFormatFromData(s3_configuration, /*format_settings=*/std::nullopt, context).second; + storage_metadata.setColumns(columns_); + } storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -64,13 +73,17 @@ StorageS3Cluster::StorageS3Cluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageS3Cluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageS3Cluster::updateQueryToSendIfNeeded(DB::ASTPtr & query, const DB::StorageSnapshotPtr & storage_snapshot, const DB::ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function s3Cluster, got '{}'", queryToString(query)); - TableFunctionS3Cluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionS3Cluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, + storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), + s3_configuration.format, + context); } void StorageS3Cluster::updateConfigurationIfChanged(ContextPtr local_context) diff --git a/src/Storages/StorageS3Cluster.h b/src/Storages/StorageS3Cluster.h index c526f14834a..ac25c506337 100644 --- a/src/Storages/StorageS3Cluster.h +++ b/src/Storages/StorageS3Cluster.h @@ -27,8 +27,7 @@ public: const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, - bool structure_argument_was_provided_); + const ContextPtr & context_); std::string getName() const override { return "S3Cluster"; } @@ -46,7 +45,7 @@ protected: private: void updateBeforeRead(const ContextPtr & context) override { updateConfigurationIfChanged(context); } - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; StorageS3::Configuration s3_configuration; NamesAndTypesList virtual_columns; diff --git a/src/Storages/StorageURL.cpp b/src/Storages/StorageURL.cpp index 0ba72af6fc0..a68ed6965fc 100644 --- a/src/Storages/StorageURL.cpp +++ b/src/Storages/StorageURL.cpp @@ -101,7 +101,7 @@ static ConnectionTimeouts getHTTPTimeouts(ContextPtr context) IStorageURLBase::IStorageURLBase( const String & 
uri_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & table_id_, const String & format_name_, const std::optional & format_settings_, @@ -123,16 +123,26 @@ IStorageURLBase::IStorageURLBase( , partition_by(partition_by_) , distributed_processing(distributed_processing_) { - FormatFactory::instance().checkFormatName(format_name); + if (format_name != "auto") + FormatFactory::instance().checkFormatName(format_name); + StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = getTableStructureAndFormatFromData(uri, compression_method, headers, format_settings, context_); + else + columns = getTableStructureFromData(format_name, uri, compression_method, headers, format_settings, context_); + storage_metadata.setColumns(columns); } else { + if (format_name == "auto") + format_name = getTableStructureAndFormatFromData(uri, compression_method, headers, format_settings, context_).second; + /// We don't allow special columns in URL storage. if (!columns_.hasOnlyOrdinary()) throw Exception(ErrorCodes::BAD_ARGUMENTS, "Table engine URL doesn't support special columns like MATERIALIZED, ALIAS or EPHEMERAL"); @@ -257,7 +267,7 @@ StorageURLSource::StorageURLSource( const String & format_, const std::optional & format_settings_, String name_, - ContextPtr context_, + const ContextPtr & context_, UInt64 max_block_size, const ConnectionTimeouts & timeouts, CompressionMethod compression_method, @@ -525,7 +535,7 @@ StorageURLSink::StorageURLSink( const String & format, const std::optional & format_settings, const Block & sample_block, - ContextPtr context, + const ContextPtr & context, const ConnectionTimeouts & timeouts, const CompressionMethod compression_method, const HTTPHeaderEntries & headers, @@ -668,7 +678,7 @@ std::vector> IStorageURLBase::getReadURIPara const Names & /*column_names*/, const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + const ContextPtr & /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { @@ -679,7 +689,7 @@ std::function IStorageURLBase::getReadPOSTDataCallback( const Names & /*column_names*/, const ColumnsDescription & /* columns_description */, const SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + const ContextPtr & /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { @@ -693,28 +703,48 @@ namespace public: ReadBufferIterator( const std::vector & urls_to_check_, - const String & format_, + std::optional format_, const CompressionMethod & compression_method_, const HTTPHeaderEntries & headers_, const std::optional & format_settings_, const ContextPtr & context_) - : WithContext(context_), format(format_), compression_method(compression_method_), headers(headers_), format_settings(format_settings_) + : WithContext(context_), format(std::move(format_)), compression_method(compression_method_), headers(headers_), format_settings(format_settings_) { url_options_to_check.reserve(urls_to_check_.size()); for (const auto & url : urls_to_check_) url_options_to_check.push_back(getFailoverOptions(url, getContext()->getSettingsRef().glob_expansion_max_elements)); } - std::pair, std::optional> next() override + Data next() override { bool is_first = (current_index == 0); - /// For default 
mode check cached columns for all urls on first iteration. - if (is_first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (is_first) { - for (const auto & options : url_options_to_check) + /// If format is unknown we iterate through all url options on first iteration and + /// try to determine format by file name. + if (!format) { - if (auto cached_columns = tryGetColumnsFromCache(options)) - return {nullptr, cached_columns}; + for (const auto & options : url_options_to_check) + { + for (const auto & url : options) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(url)) + { + format = format_from_file_name; + break; + } + } + } + } + + /// For default mode check cached columns for all urls on first iteration. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + for (const auto & options : url_options_to_check) + { + if (auto cached_columns = tryGetColumnsFromCache(options)) + return {nullptr, cached_columns, format}; + } } } @@ -724,20 +754,30 @@ namespace if (current_index == url_options_to_check.size()) { if (is_first) + { + if (format) + throw Exception( + ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + "The table structure cannot be extracted from a {} format file, because all files are empty. " + "You can specify table structure manually", + *format); + throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "Cannot extract table structure from {} format file, because all files are empty. " - "You must specify table structure manually", - format); - return {nullptr, std::nullopt}; + "The data format cannot be detected by the contents of the files, because there are no files with provided path " + "You can specify the format manually"); + + } + + return {nullptr, std::nullopt, format}; } if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { - if (auto cached_columns = tryGetColumnsFromCache(url_options_to_check[current_index])) + if (auto cached_schema = tryGetColumnsFromCache(url_options_to_check[current_index])) { ++current_index; - return {nullptr, cached_columns}; + return {nullptr, cached_schema, format}; } } @@ -762,7 +802,7 @@ namespace return {wrapReadBufferWithCompressionMethod( std::move(uri_and_buf.second), compression_method, - static_cast(getContext()->getSettingsRef().zstd_window_log_max)), std::nullopt}; + static_cast(getContext()->getSettingsRef().zstd_window_log_max)), std::nullopt, format}; } void setNumRowsToLastFile(size_t num_rows) override @@ -770,7 +810,7 @@ namespace if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url) return; - auto key = getKeyForSchemaCache(current_url_option, format, format_settings, getContext()); + auto key = getKeyForSchemaCache(current_url_option, *format, format_settings, getContext()); StorageURL::getSchemaCache(getContext()).addNumRows(key, num_rows); } @@ -780,7 +820,7 @@ namespace || getContext()->getSettingsRef().schema_inference_mode != SchemaInferenceMode::UNION) return; - auto key = getKeyForSchemaCache(current_url_option, format, format_settings, getContext()); + auto key = getKeyForSchemaCache(current_url_option, *format, format_settings, getContext()); StorageURL::getSchemaCache(getContext()).addColumns(key, columns); } @@ -792,17 +832,45 @@ namespace for (const auto & options : url_options_to_check) { - auto keys = getKeysForSchemaCache(options, format, format_settings, getContext()); + auto keys = getKeysForSchemaCache(options, *format, 
format_settings, getContext()); StorageURL::getSchemaCache(getContext()).addManyColumns(keys, columns); } } + void setFormatName(const String & format_name) override + { + format = format_name; + } + String getLastFileName() const override { return current_url_option; } + bool supportsLastReadBufferRecreation() const override { return true; } + + std::unique_ptr recreateLastReadBuffer() override + { + chassert(current_index > 0 && current_index <= url_options_to_check.size()); + auto first_option = url_options_to_check[current_index - 1].cbegin(); + auto uri_and_buf = StorageURLSource::getFirstAvailableURIAndReadBuffer( + first_option, + url_options_to_check[current_index - 1].cend(), + getContext(), + {}, + Poco::Net::HTTPRequest::HTTP_GET, + {}, + getHTTPTimeouts(getContext()), + credentials, + headers, + false, + false); + + return wrapReadBufferWithCompressionMethod(std::move(uri_and_buf.second), compression_method, static_cast(getContext()->getSettingsRef().zstd_window_log_max)); + } + private: std::optional tryGetColumnsFromCache(const Strings & urls) { - if (!getContext()->getSettingsRef().schema_inference_use_cache_for_url) + auto context = getContext(); + if (!context->getSettingsRef().schema_inference_use_cache_for_url) return std::nullopt; auto & schema_cache = StorageURL::getSchemaCache(getContext()); @@ -810,7 +878,7 @@ namespace { auto get_last_mod_time = [&]() -> std::optional { - auto last_mod_time = StorageURL::tryGetLastModificationTime(url, headers, credentials, getContext()); + auto last_mod_time = StorageURL::tryGetLastModificationTime(url, headers, credentials, context); /// Some URLs could not have Last-Modified header, in this case we cannot be sure that /// data wasn't changed after adding it's schema to cache. Use schema from cache only if /// special setting for this case is enabled. @@ -819,10 +887,27 @@ namespace return last_mod_time; }; - auto cache_key = getKeyForSchemaCache(url, format, format_settings, getContext()); - auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time); - if (columns) - return columns; + if (format) + { + auto cache_key = getKeyForSchemaCache(url, *format, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + return columns; + } + else + { + /// If format is unknown, we can iterate through all possible input formats + /// and check if we have an entry with this format and this file in schema cache. + /// If we have such entry for some format, we can use this format to read the file. 
+ for (const auto & format_name : FormatFactory::instance().getAllInputFormats()) + { + auto cache_key = getKeyForSchemaCache(url, format_name, format_settings, context); + if (auto columns = schema_cache.tryGetColumns(cache_key, get_last_mod_time)) + { + format = format_name; + return columns; + } + } + } } return std::nullopt; @@ -831,7 +916,7 @@ namespace std::vector> url_options_to_check; size_t current_index = 0; String current_url_option; - const String & format; + std::optional format; const CompressionMethod & compression_method; const HTTPHeaderEntries & headers; Poco::Net::HTTPBasicCredentials credentials; @@ -839,13 +924,13 @@ namespace }; } -ColumnsDescription IStorageURLBase::getTableStructureFromData( - const String & format, +std::pair IStorageURLBase::getTableStructureAndFormatFromDataImpl( + std::optional format, const String & uri, CompressionMethod compression_method, const HTTPHeaderEntries & headers, const std::optional & format_settings, - ContextPtr context) + const ContextPtr & context) { context->getRemoteHostFilter().checkURL(Poco::URI(uri)); @@ -858,7 +943,30 @@ ColumnsDescription IStorageURLBase::getTableStructureFromData( urls_to_check = {uri}; ReadBufferIterator read_buffer_iterator(urls_to_check, format, compression_method, headers, format_settings, context); - return readSchemaFromFormat(format, format_settings, read_buffer_iterator, urls_to_check.size() > 1, context); + if (format) + return {readSchemaFromFormat(*format, format_settings, read_buffer_iterator, context), *format}; + return detectFormatAndReadSchema(format_settings, read_buffer_iterator, context); +} + +ColumnsDescription IStorageURLBase::getTableStructureFromData( + const String & format, + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context) +{ + return getTableStructureAndFormatFromDataImpl(format, uri, compression_method, headers, format_settings, context).first; +} + +std::pair IStorageURLBase::getTableStructureAndFormatFromData( + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context) +{ + return getTableStructureAndFormatFromDataImpl(std::nullopt, uri, compression_method, headers, format_settings, context); } bool IStorageURLBase::supportsSubsetOfColumns(const ContextPtr & context) const @@ -1243,7 +1351,7 @@ StorageURL::StorageURL( const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_, const HTTPHeaderEntries & headers_, const String & http_method_, @@ -1276,7 +1384,7 @@ StorageURLWithFailover::StorageURLWithFailover( const std::optional & format_settings_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_) : StorageURL("", table_id_, format_name_, format_settings_, columns_, constraints_, String{}, context_, compression_method_) { @@ -1325,7 +1433,7 @@ FormatSettings StorageURL::getFormatSettingsFromArgs(const StorageFactory::Argum } size_t StorageURL::evalArgsAndCollectHeaders( - ASTs & url_function_args, HTTPHeaderEntries & header_entries, ContextPtr context) + ASTs & url_function_args, HTTPHeaderEntries & header_entries, const ContextPtr & context) { ASTs::iterator headers_it = 
url_function_args.end(); @@ -1409,7 +1517,7 @@ void StorageURL::processNamedCollectionResult(Configuration & configuration, con configuration.structure = collection.getOrDefault("structure", "auto"); } -StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, ContextPtr local_context) +StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, const ContextPtr & local_context) { StorageURL::Configuration configuration; @@ -1433,7 +1541,7 @@ StorageURL::Configuration StorageURL::getConfiguration(ASTs & args, ContextPtr l } if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(Poco::URI(configuration.url).getPath(), true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(configuration.url).getPath()).value_or("auto"); for (const auto & [header, value] : configuration.headers) { diff --git a/src/Storages/StorageURL.h b/src/Storages/StorageURL.h index c8b8d0942f4..18a90c7bb82 100644 --- a/src/Storages/StorageURL.h +++ b/src/Storages/StorageURL.h @@ -57,7 +57,15 @@ public: CompressionMethod compression_method, const HTTPHeaderEntries & headers, const std::optional & format_settings, - ContextPtr context); + const ContextPtr & context); + + static std::pair getTableStructureAndFormatFromData( + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context); + static SchemaCache & getSchemaCache(const ContextPtr & context); @@ -72,7 +80,7 @@ protected: IStorageURLBase( const String & uri_, - ContextPtr context_, + const ContextPtr & context_, const StorageID & id_, const String & format_name_, const std::optional & format_settings_, @@ -106,7 +114,7 @@ protected: const Names & column_names, const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const; @@ -114,7 +122,7 @@ protected: const Names & column_names, const ColumnsDescription & columns_description, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const; @@ -127,6 +135,14 @@ protected: bool supportsTrivialCountOptimization() const override { return true; } private: + static std::pair getTableStructureAndFormatFromDataImpl( + std::optional format, + const String & uri, + CompressionMethod compression_method, + const HTTPHeaderEntries & headers, + const std::optional & format_settings, + const ContextPtr & context); + virtual Block getHeaderBlock(const Names & column_names, const StorageSnapshotPtr & storage_snapshot) const = 0; }; @@ -160,7 +176,7 @@ public: const String & format, const std::optional & format_settings, String name_, - ContextPtr context, + const ContextPtr & context, UInt64 max_block_size, const ConnectionTimeouts & timeouts, CompressionMethod compression_method, @@ -231,7 +247,7 @@ public: const String & format, const std::optional & format_settings, const Block & sample_block, - ContextPtr context, + const ContextPtr & context, const ConnectionTimeouts & timeouts, CompressionMethod compression_method, const HTTPHeaderEntries & headers = {}, @@ -263,7 +279,7 @@ public: const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const String & comment, - ContextPtr context_, + const ContextPtr & context_, const String & 
compression_method_, const HTTPHeaderEntries & headers_ = {}, const String & method_ = "", @@ -292,12 +308,12 @@ public: std::string addresses_expr; }; - static Configuration getConfiguration(ASTs & args, ContextPtr context); + static Configuration getConfiguration(ASTs & args, const ContextPtr & context); /// Does evaluateConstantExpressionOrIdentifierAsLiteral() on all arguments. /// If `headers(...)` argument is present, parses it and moves it to the end of the array. /// Returns number of arguments excluding `headers(...)`. - static size_t evalArgsAndCollectHeaders(ASTs & url_function_args, HTTPHeaderEntries & header_entries, ContextPtr context); + static size_t evalArgsAndCollectHeaders(ASTs & url_function_args, HTTPHeaderEntries & header_entries, const ContextPtr & context); static void processNamedCollectionResult(Configuration & configuration, const NamedCollection & collection); }; @@ -314,7 +330,7 @@ public: const std::optional & format_settings_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - ContextPtr context_, + const ContextPtr & context_, const String & compression_method_); void read( diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index a0b5fcd6f28..d71dfea7693 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -35,36 +35,43 @@ namespace ErrorCodes } StorageURLCluster::StorageURLCluster( - ContextPtr context_, + const ContextPtr & context, const String & cluster_name_, const String & uri_, const String & format_, - const String & compression_method_, + const String & compression_method, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const StorageURL::Configuration & configuration_, - bool structure_argument_was_provided_) - : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageURLCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) - , uri(uri_) + const StorageURL::Configuration & configuration_) + : IStorageCluster(cluster_name_, table_id_, &Poco::Logger::get("StorageURLCluster (" + table_id_.table_name + ")")) + , uri(uri_), format_name(format_) { - context_->getRemoteHostFilter().checkURL(Poco::URI(uri)); - context_->getHTTPHeaderFilter().checkHeaders(configuration_.headers); + context->getRemoteHostFilter().checkURL(Poco::URI(uri)); + context->getHTTPHeaderFilter().checkHeaders(configuration_.headers); StorageInMemoryMetadata storage_metadata; if (columns_.empty()) { - auto columns = StorageURL::getTableStructureFromData(format_, - uri, - chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method_), - configuration_.headers, - std::nullopt, - context_); + ColumnsDescription columns; + if (format_name == "auto") + std::tie(columns, format_name) = StorageURL::getTableStructureAndFormatFromData( + uri, chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method), configuration_.headers, std::nullopt, context); + else + columns = StorageURL::getTableStructureFromData( + format_, uri, chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method), configuration_.headers, std::nullopt, context); + storage_metadata.setColumns(columns); } else + { + if (format_name == "auto") + format_name = StorageURL::getTableStructureAndFormatFromData( + uri, chooseCompressionMethod(Poco::URI(uri).getPath(), compression_method), configuration_.headers, std::nullopt, context).second; + storage_metadata.setColumns(columns_); + } 
storage_metadata.setConstraints(constraints_); setInMemoryMetadata(storage_metadata); @@ -72,13 +79,14 @@ StorageURLCluster::StorageURLCluster( virtual_columns = VirtualColumnUtils::getPathFileAndSizeVirtualsForStorage(storage_metadata.getSampleBlock().getNamesAndTypesList()); } -void StorageURLCluster::addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) +void StorageURLCluster::updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) { ASTExpressionList * expression_list = extractTableFunctionArgumentsFromSelectQuery(query); if (!expression_list) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected SELECT query from table function urlCluster, got '{}'", queryToString(query)); - TableFunctionURLCluster::addColumnsStructureToArguments(expression_list->children, structure, context); + TableFunctionURLCluster::updateStructureAndFormatArgumentsIfNeeded( + expression_list->children, storage_snapshot->metadata->getColumns().getAll().toNamesAndTypesDescription(), format_name, context); } RemoteQueryExecutor::Extension StorageURLCluster::getTaskIteratorExtension(const ActionsDAG::Node * predicate, const ContextPtr & context) const diff --git a/src/Storages/StorageURLCluster.h b/src/Storages/StorageURLCluster.h index 07978040029..f57d262f434 100644 --- a/src/Storages/StorageURLCluster.h +++ b/src/Storages/StorageURLCluster.h @@ -19,16 +19,15 @@ class StorageURLCluster : public IStorageCluster { public: StorageURLCluster( - ContextPtr context_, + const ContextPtr & context, const String & cluster_name_, const String & uri_, const String & format_, - const String & compression_method_, + const String & compression_method, const StorageID & table_id_, const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, - const StorageURL::Configuration & configuration_, - bool structure_argument_was_provided_); + const StorageURL::Configuration & configuration_); std::string getName() const override { return "URLCluster"; } @@ -41,11 +40,10 @@ public: bool supportsTrivialCountOptimization() const override { return true; } private: - void addColumnsStructureToQuery(ASTPtr & query, const String & structure, const ContextPtr & context) override; + void updateQueryToSendIfNeeded(ASTPtr & query, const StorageSnapshotPtr & storage_snapshot, const ContextPtr & context) override; String uri; String format_name; - String compression_method; NamesAndTypesList virtual_columns; }; diff --git a/src/Storages/StorageXDBC.cpp b/src/Storages/StorageXDBC.cpp index a274b1ba4db..c01d0310952 100644 --- a/src/Storages/StorageXDBC.cpp +++ b/src/Storages/StorageXDBC.cpp @@ -59,7 +59,7 @@ std::vector> StorageXDBC::getReadURIParams( const Names & /* column_names */, const StorageSnapshotPtr & /*storage_snapshot*/, const SelectQueryInfo & /*query_info*/, - ContextPtr /*context*/, + const ContextPtr & /*context*/, QueryProcessingStage::Enum & /*processed_stage*/, size_t max_block_size) const { @@ -70,7 +70,7 @@ std::function StorageXDBC::getReadPOSTDataCallback( const Names & column_names, const ColumnsDescription & columns_description, const SelectQueryInfo & query_info, - ContextPtr local_context, + const ContextPtr & local_context, QueryProcessingStage::Enum & /*processed_stage*/, size_t /*max_block_size*/) const { diff --git a/src/Storages/StorageXDBC.h b/src/Storages/StorageXDBC.h index fe678785dc2..9a0a9b5afa1 100644 --- a/src/Storages/StorageXDBC.h +++ b/src/Storages/StorageXDBC.h @@ -55,7 +55,7 @@ private: 
const Names & column_names, const StorageSnapshotPtr & storage_snapshot, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const override; @@ -63,7 +63,7 @@ private: const Names & column_names, const ColumnsDescription & columns_description, const SelectQueryInfo & query_info, - ContextPtr context, + const ContextPtr & context, QueryProcessingStage::Enum & processed_stage, size_t max_block_size) const override; diff --git a/src/TableFunctions/ITableFunctionCluster.h b/src/TableFunctions/ITableFunctionCluster.h index 7e81d6d21b7..9f56d781bc9 100644 --- a/src/TableFunctions/ITableFunctionCluster.h +++ b/src/TableFunctions/ITableFunctionCluster.h @@ -4,7 +4,6 @@ #include #include -#include #include #include #include @@ -29,14 +28,14 @@ public: String getName() const override = 0; String getSignature() const override = 0; - static void addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context) + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure_, const String & format_, const ContextPtr & context) { if (args.empty()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected empty list of arguments for {}Cluster table function", Base::name); ASTPtr cluster_name_arg = args.front(); args.erase(args.begin()); - Base::addColumnsStructureToArguments(args, desired_structure, context); + Base::updateStructureAndFormatArgumentsIfNeeded(args, structure_, format_, context); args.insert(args.begin(), cluster_name_arg); } diff --git a/src/TableFunctions/ITableFunctionFileLike.cpp b/src/TableFunctions/ITableFunctionFileLike.cpp index b88af855309..b697f3df925 100644 --- a/src/TableFunctions/ITableFunctionFileLike.cpp +++ b/src/TableFunctions/ITableFunctionFileLike.cpp @@ -27,14 +27,14 @@ void ITableFunctionFileLike::parseFirstArguments(const ASTPtr & arg, const Conte filename = checkAndGetLiteralArgument(arg, "source"); } -String ITableFunctionFileLike::getFormatFromFirstArgument() +std::optional ITableFunctionFileLike::tryGetFormatFromFirstArgument() { - return FormatFactory::instance().getFormatFromFileName(filename, true); + return FormatFactory::instance().tryGetFormatFromFileName(filename); } bool ITableFunctionFileLike::supportsReadingSubsetOfColumns(const ContextPtr & context) { - return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format, context); + return format != "auto" && FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format, context); } void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context) @@ -63,7 +63,10 @@ void ITableFunctionFileLike::parseArgumentsImpl(ASTs & args, const ContextPtr & format = checkAndGetLiteralArgument(args[1], "format"); if (format == "auto") - format = getFormatFromFirstArgument(); + { + if (auto format_from_first_argument = tryGetFormatFromFirstArgument()) + format = *format_from_first_argument; + } if (args.size() > 2) { @@ -79,34 +82,37 @@ void ITableFunctionFileLike::parseArgumentsImpl(ASTs & args, const ContextPtr & compression_method = checkAndGetLiteralArgument(args[3], "compression_method"); } -void ITableFunctionFileLike::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr &) +void ITableFunctionFileLike::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context) { if (args.empty() || args.size() 
> getMaxNumberOfArguments()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 1 to {} arguments in table function, got {}", getMaxNumberOfArguments(), args.size()); + auto format_literal = std::make_shared(format); auto structure_literal = std::make_shared(structure); + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + /// f(filename) if (args.size() == 1) { - /// Add format=auto before structure argument. - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); args.push_back(structure_literal); } /// f(filename, format) else if (args.size() == 2) { + if (checkAndGetLiteralArgument(args[1], "format") == "auto") + args.back() = format_literal; args.push_back(structure_literal); } - /// f(filename, format, 'auto') - else if (args.size() == 3) + /// f(filename, format, structure) or f(filename, format, structure, compression) + else if (args.size() >= 3) { - args.back() = structure_literal; - } - /// f(filename, format, 'auto', compression) - else if (args.size() == 4) - { - args[args.size() - 2] = structure_literal; + if (checkAndGetLiteralArgument(args[1], "format") == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } } diff --git a/src/TableFunctions/ITableFunctionFileLike.h b/src/TableFunctions/ITableFunctionFileLike.h index 5fe86587797..b378f2f3a6c 100644 --- a/src/TableFunctions/ITableFunctionFileLike.h +++ b/src/TableFunctions/ITableFunctionFileLike.h @@ -31,7 +31,7 @@ public: static size_t getMaxNumberOfArguments() { return 4; } - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr &); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr &); protected: @@ -39,7 +39,7 @@ protected: virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); virtual void parseFirstArguments(const ASTPtr & arg, const ContextPtr & context); - virtual String getFormatFromFirstArgument(); + virtual std::optional tryGetFormatFromFirstArgument(); String filename; String path_to_archive; diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index d394c836369..b9e0af53b7b 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -58,7 +58,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path); } else { @@ -155,7 +155,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); } } @@ -174,15 +174,24 @@ void TableFunctionAzureBlobStorage::parseArguments(const ASTPtr & ast_function, parseArgumentsImpl(args, context); } -void TableFunctionAzureBlobStorage::addColumnsStructureToArguments(ASTs & args, const String & structure, const 
ContextPtr & context) +void TableFunctionAzureBlobStorage::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context) { - if (tryGetNamedCollectionWithOverrides(args, context)) + if (auto collection = tryGetNamedCollectionWithOverrides(args, context)) { - /// In case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); + /// In case of named collection, just add key-value pairs "format='...', structure='...'" + /// at the end of arguments to override existed format and structure with "auto" values. + if (collection->getOrDefault("format", "auto") == "auto") + { + ASTs format_equal_func_args = {std::make_shared("format"), std::make_shared(format)}; + auto format_equal_func = makeASTFunction("equals", std::move(format_equal_func_args)); + args.push_back(format_equal_func); + } + if (collection->getOrDefault("structure", "auto") == "auto") + { + ASTs structure_equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; + auto structure_equal_func = makeASTFunction("equals", std::move(structure_equal_func_args)); + args.push_back(structure_equal_func); + } } else { @@ -191,65 +200,126 @@ void TableFunctionAzureBlobStorage::addColumnsStructureToArguments(ASTs & args, "Storage Azure requires 3 to 7 arguments: " "AzureBlobStorage(connection_string|storage_account_url, container_name, blobpath, [account_name, account_key, format, compression, structure])"); + auto format_literal = std::make_shared(format); auto structure_literal = std::make_shared(structure); + for (auto & arg : args) + arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); + auto is_format_arg = [](const std::string & s) -> bool { return s == "auto" || FormatFactory::instance().getAllFormats().contains(s); }; - + /// (connection_string, container_name, blobpath) if (args.size() == 3) { - /// Add format=auto & compression=auto before structure argument. - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); + /// Add compression = "auto" before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + /// (connection_string, container_name, blobpath, structure) or + /// (connection_string, container_name, blobpath, format) + /// We can distinguish them by looking at the 4-th argument: check if it's format name or not. else if (args.size() == 4) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name/structure"); + /// (..., format) -> (..., format, compression, structure) if (is_format_arg(fourth_arg)) { + if (fourth_arg == "auto") + args[3] = format_literal; /// Add compression=auto before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + /// (..., structure) -> (..., format, compression, structure) else { - args.back() = structure_literal; + auto structure_arg = args.back(); + args[3] = format_literal; + /// Add compression=auto before structure argument. 
+ args.push_back(std::make_shared("auto")); + if (fourth_arg == "auto") + args.push_back(structure_literal); + else + args.push_back(structure_arg); } } + /// (connection_string, container_name, blobpath, format, compression) or + /// (storage_account_url, container_name, blobpath, account_name, account_key) + /// We can distinguish them by looking at the 4-th argument: check if it's format name or not. else if (args.size() == 5) { auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) + /// (..., format, compression) -> (..., format, compression, structure) + if (is_format_arg(fourth_arg)) { - /// Add format=auto & compression=auto before structure argument. - args.push_back(std::make_shared("auto")); - args.push_back(std::make_shared("auto")); + if (fourth_arg == "auto") + args[3] = format_literal; + args.push_back(structure_literal); } - args.push_back(structure_literal); - } - else if (args.size() == 6) - { - auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); - if (!is_format_arg(fourth_arg)) + /// (..., account_name, account_key) -> (..., account_name, account_key, format, compression, structure) + else { + args.push_back(format_literal); /// Add compression=auto before structure argument. args.push_back(std::make_shared("auto")); args.push_back(structure_literal); } + } + /// (connection_string, container_name, blobpath, format, compression, structure) or + /// (storage_account_url, container_name, blobpath, account_name, account_key, structure) or + /// (storage_account_url, container_name, blobpath, account_name, account_key, format) + else if (args.size() == 6) + { + auto fourth_arg = checkAndGetLiteralArgument(args[3], "format/account_name"); + auto sixth_arg = checkAndGetLiteralArgument(args[5], "format/structure"); + + /// (..., format, compression, structure) + if (is_format_arg(fourth_arg)) + { + if (fourth_arg == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[5], "structure") == "auto") + args[5] = structure_literal; + } + /// (..., account_name, account_key, format) -> (..., account_name, account_key, format, compression, structure) + else if (is_format_arg(sixth_arg)) + { + if (sixth_arg == "auto") + args[5] = format_literal; + /// Add compression=auto before structure argument. + args.push_back(std::make_shared("auto")); + args.push_back(structure_literal); + } + /// (..., account_name, account_key, structure) -> (..., account_name, account_key, format, compression, structure) else { - args.back() = structure_literal; + auto structure_arg = args.back(); + args[5] = format_literal; + /// Add compression=auto before structure argument. 
+ args.push_back(std::make_shared("auto")); + if (sixth_arg == "auto") + args.push_back(structure_literal); + else + args.push_back(structure_arg); } } + /// (storage_account_url, container_name, blobpath, account_name, account_key, format, compression) else if (args.size() == 7) { + /// (..., format, compression) -> (..., format, compression, structure) + if (checkAndGetLiteralArgument(args[5], "format") == "auto") + args[5] = format_literal; args.push_back(structure_literal); } + /// (storage_account_url, container_name, blobpath, account_name, account_key, format, compression, structure) else if (args.size() == 8) { - args.back() = structure_literal; + if (checkAndGetLiteralArgument(args[5], "format") == "auto") + args[5] = format_literal; + if (checkAndGetLiteralArgument(args[7], "structure") == "auto") + args[7] = structure_literal; } } } @@ -263,7 +333,9 @@ ColumnsDescription TableFunctionAzureBlobStorage::getActualTableStructure(Contex auto settings = StorageAzureBlob::createSettings(context); auto object_storage = std::make_unique("AzureBlobStorageTableFunction", std::move(client), std::move(settings)); - return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context, false); + if (configuration.format == "auto") + return StorageAzureBlob::getTableStructureAndFormatFromData(object_storage.get(), configuration, std::nullopt, context).first; + return StorageAzureBlob::getTableStructureFromData(object_storage.get(), configuration, std::nullopt, context); } return parseColumnsListFromString(configuration.structure, context); diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.h b/src/TableFunctions/TableFunctionAzureBlobStorage.h index 1a221f60c55..9622881b417 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.h +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.h @@ -55,7 +55,7 @@ public: virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context); protected: diff --git a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp index eee585967c2..a2221cf35b6 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorageCluster.cpp @@ -21,9 +21,8 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( { StoragePtr storage; ColumnsDescription columns; - bool structure_argument_was_provided = configuration.structure != "auto"; - if (structure_argument_was_provided) + if (configuration.structure != "auto") { columns = parseColumnsListFromString(configuration.structure, context); } @@ -59,8 +58,7 @@ StoragePtr TableFunctionAzureBlobStorageCluster::executeImpl( StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, - context, - structure_argument_was_provided); + context); } storage->startup(); diff --git a/src/TableFunctions/TableFunctionFile.cpp b/src/TableFunctions/TableFunctionFile.cpp index 8a9dde374ec..b481076e9b6 100644 --- a/src/TableFunctions/TableFunctionFile.cpp +++ b/src/TableFunctions/TableFunctionFile.cpp @@ -54,12 +54,12 @@ void TableFunctionFile::parseFirstArguments(const ASTPtr & arg, const ContextPtr throw Exception(ErrorCodes::BAD_ARGUMENTS, "The first argument of table 
function '{}' mush be path or file descriptor", getName()); } -String TableFunctionFile::getFormatFromFirstArgument() +std::optional TableFunctionFile::tryGetFormatFromFirstArgument() { if (fd >= 0) - return FormatFactory::instance().getFormatFromFileDescriptor(fd); + return FormatFactory::instance().tryGetFormatFromFileDescriptor(fd); else - return FormatFactory::instance().getFormatFromFileName(filename, true); + return FormatFactory::instance().tryGetFormatFromFileName(filename); } StoragePtr TableFunctionFile::getStorage(const String & source, @@ -104,10 +104,11 @@ ColumnsDescription TableFunctionFile::getActualTableStructure(ContextPtr context archive_info = StorageFile::getArchiveInfo(path_to_archive, filename, context->getUserFilesPath(), context, total_bytes_to_read); + if (format == "auto") + return StorageFile::getTableStructureAndFormatFromFile(paths, compression_method, std::nullopt, context, archive_info).first; return StorageFile::getTableStructureFromFile(format, paths, compression_method, std::nullopt, context, archive_info); } - return parseColumnsListFromString(structure, context); } diff --git a/src/TableFunctions/TableFunctionFile.h b/src/TableFunctions/TableFunctionFile.h index 6eaab29db8a..1347284753e 100644 --- a/src/TableFunctions/TableFunctionFile.h +++ b/src/TableFunctions/TableFunctionFile.h @@ -27,7 +27,7 @@ public: protected: int fd = -1; void parseFirstArguments(const ASTPtr & arg, const ContextPtr & context) override; - String getFormatFromFirstArgument() override; + std::optional tryGetFormatFromFirstArgument() override; private: StoragePtr getStorage( diff --git a/src/TableFunctions/TableFunctionFileCluster.cpp b/src/TableFunctions/TableFunctionFileCluster.cpp index 843909e2a58..3e53349b022 100644 --- a/src/TableFunctions/TableFunctionFileCluster.cpp +++ b/src/TableFunctions/TableFunctionFileCluster.cpp @@ -43,8 +43,7 @@ StoragePtr TableFunctionFileCluster::getStorage( compression_method, StorageID(getDatabaseName(), table_name), columns, - ConstraintsDescription{}, - structure != "auto"); + ConstraintsDescription{}); } return storage; diff --git a/src/TableFunctions/TableFunctionFormat.cpp b/src/TableFunctions/TableFunctionFormat.cpp index 4b6d0f70c0a..ad2a142a140 100644 --- a/src/TableFunctions/TableFunctionFormat.cpp +++ b/src/TableFunctions/TableFunctionFormat.cpp @@ -33,7 +33,9 @@ namespace ErrorCodes namespace { -/* format(format_name, data) - ... +/* format(format_name, structure, data) - parses data according to the specified format and structure. + * format(format_name, data) - infers the schema from the data and parses it according to the specified format. + * format(data) - detects the format, infers the schema and parses data according to inferred format and structure. 
*/ class TableFunctionFormat : public ITableFunction { @@ -49,11 +51,11 @@ private: ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; void parseArguments(const ASTPtr & ast_function, ContextPtr context) override; - Block parseData(ColumnsDescription columns, ContextPtr context) const; + Block parseData(const ColumnsDescription & columns, const String & format_name, const ContextPtr & context) const; - String format; - String data; + String format = "auto"; String structure = "auto"; + String data; }; void TableFunctionFormat::parseArguments(const ASTPtr & ast_function, ContextPtr context) @@ -65,14 +67,15 @@ void TableFunctionFormat::parseArguments(const ASTPtr & ast_function, ContextPtr ASTs & args = args_func.at(0)->children; - if (args.size() != 2 && args.size() != 3) - throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires 2 or 3 arguments: format, [structure], data", getName()); + if (args.empty() || args.size() > 3) + throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH, "Table function '{}' requires from 1 to 3 arguments: [format, [structure]], data", getName()); for (auto & arg : args) arg = evaluateConstantExpressionOrIdentifierAsLiteral(arg, context); - format = checkAndGetLiteralArgument(args[0], "format"); data = checkAndGetLiteralArgument(args.back(), "data"); + if (args.size() > 1) + format = checkAndGetLiteralArgument(args[0], "format"); if (args.size() == 3) structure = checkAndGetLiteralArgument(args[1], "structure"); } @@ -82,19 +85,21 @@ ColumnsDescription TableFunctionFormat::getActualTableStructure(ContextPtr conte if (structure == "auto") { SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); - return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, false, context); + if (format == "auto") + return detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context).first; + return readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); } return parseColumnsListFromString(structure, context); } -Block TableFunctionFormat::parseData(ColumnsDescription columns, ContextPtr context) const +Block TableFunctionFormat::parseData(const ColumnsDescription & columns, const String & format_name, const ContextPtr & context) const { Block block; for (const auto & name_and_type : columns.getAllPhysical()) block.insert({name_and_type.type->createColumn(), name_and_type.type, name_and_type.name}); auto read_buf = std::make_unique(data); - auto input_format = context->getInputFormat(format, *read_buf, block, context->getSettingsRef().max_block_size); + auto input_format = context->getInputFormat(format_name, *read_buf, block, context->getSettingsRef().max_block_size); QueryPipelineBuilder builder; builder.init(Pipe(input_format)); if (columns.hasDefaults()) @@ -120,10 +125,24 @@ Block TableFunctionFormat::parseData(ColumnsDescription columns, ContextPtr cont return concatenateBlocks(blocks); } -StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool is_insert_query) const +StoragePtr TableFunctionFormat::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/, bool /*is_insert_query*/) const { - auto columns = getActualTableStructure(context, is_insert_query); - Block res_block = parseData(columns, context); + ColumnsDescription columns; + 
String format_name = format; + if (structure == "auto") + { + SingleReadBufferIterator read_buffer_iterator(std::make_unique(data)); + if (format_name == "auto") + std::tie(columns, format_name) = detectFormatAndReadSchema(std::nullopt, read_buffer_iterator, context); + else + columns = readSchemaFromFormat(format, std::nullopt, read_buffer_iterator, context); + } + else + { + columns = parseColumnsListFromString(structure, context); + } + + Block res_block = parseData(columns, format_name, context); auto res = std::make_shared(StorageID(getDatabaseName(), table_name), columns, res_block); res->startup(); return res; diff --git a/src/TableFunctions/TableFunctionHDFS.cpp b/src/TableFunctions/TableFunctionHDFS.cpp index 8d48a7ba30e..2dac4398144 100644 --- a/src/TableFunctions/TableFunctionHDFS.cpp +++ b/src/TableFunctions/TableFunctionHDFS.cpp @@ -33,6 +33,8 @@ ColumnsDescription TableFunctionHDFS::getActualTableStructure(ContextPtr context if (structure == "auto") { context->checkAccess(getSourceAccessType()); + if (format == "auto") + return StorageHDFS::getTableStructureAndFormatFromData(filename, compression_method, context).first; return StorageHDFS::getTableStructureFromData(format, filename, compression_method, context); } diff --git a/src/TableFunctions/TableFunctionHDFSCluster.cpp b/src/TableFunctions/TableFunctionHDFSCluster.cpp index 6fb7ed0fce5..57ce6d2b9ff 100644 --- a/src/TableFunctions/TableFunctionHDFSCluster.cpp +++ b/src/TableFunctions/TableFunctionHDFSCluster.cpp @@ -45,8 +45,7 @@ StoragePtr TableFunctionHDFSCluster::getStorage( format, columns, ConstraintsDescription{}, - compression_method, - structure != "auto"); + compression_method); } return storage; } diff --git a/src/TableFunctions/TableFunctionS3.cpp b/src/TableFunctions/TableFunctionS3.cpp index a9c5a5c99f0..3fedd38277c 100644 --- a/src/TableFunctions/TableFunctionS3.cpp +++ b/src/TableFunctions/TableFunctionS3.cpp @@ -61,12 +61,11 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context if (configuration.format == "auto") { String file_path = named_collection->getOrDefault("filename", Poco::URI(named_collection->get("url")).getPath()); - configuration.format = FormatFactory::instance().getFormatFromFileName(file_path, true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(file_path).value_or("auto"); } } else { - size_t count = StorageURL::evalArgsAndCollectHeaders(args, configuration.headers_from_ast, context); if (count == 0 || count > 7) @@ -216,7 +215,7 @@ void TableFunctionS3::parseArgumentsImpl(ASTs & args, const ContextPtr & context configuration.auth_settings.no_sign_request = no_sign_request; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(Poco::URI(url).getPath(), true); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(url).getPath()).value_or("auto"); } configuration.keys = {configuration.url.key}; @@ -238,15 +237,24 @@ void TableFunctionS3::parseArguments(const ASTPtr & ast_function, ContextPtr con parseArgumentsImpl(args, context); } -void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context) +void TableFunctionS3::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context) { - if (tryGetNamedCollectionWithOverrides(args, context)) + if (auto collection = tryGetNamedCollectionWithOverrides(args, context)) { - /// In 
case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); + /// In case of named collection, just add key-value pairs "format='...', structure='...'" + /// at the end of arguments to override existed format and structure with "auto" values. + if (collection->getOrDefault("format", "auto") == "auto") + { + ASTs format_equal_func_args = {std::make_shared("format"), std::make_shared(format)}; + auto format_equal_func = makeASTFunction("equals", std::move(format_equal_func_args)); + args.push_back(format_equal_func); + } + if (collection->getOrDefault("structure", "auto") == "auto") + { + ASTs structure_equal_func_args = {std::make_shared("structure"), std::make_shared(structure)}; + auto structure_equal_func = makeASTFunction("equals", std::move(structure_equal_func_args)); + args.push_back(structure_equal_func); + } } else { @@ -256,23 +264,25 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & if (count == 0 || count > getMaxNumberOfArguments()) throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected 1 to {} arguments in table function, got {}", getMaxNumberOfArguments(), count); + auto format_literal = std::make_shared(format); auto structure_literal = std::make_shared(structure); - /// s3(s3_url) + /// s3(s3_url) -> s3(s3_url, format, structure) if (count == 1) { - /// Add format=auto before structure argument. - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); args.push_back(structure_literal); } - /// s3(s3_url, format) or s3(s3_url, NOSIGN) + /// s3(s3_url, format) -> s3(s3_url, format, structure) or + /// s3(s3_url, NOSIGN) -> s3(s3_url, NOSIGN, format, structure) /// We can distinguish them by looking at the 2-nd argument: check if it's NOSIGN or not. else if (count == 2) { auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); - /// If there is NOSIGN, add format=auto before structure. if (boost::iequals(second_arg, "NOSIGN")) - args.push_back(std::make_shared("auto")); + args.push_back(format_literal); + else if (second_arg == "auto") + args.back() = format_literal; args.push_back(structure_literal); } /// s3(source, format, structure) or @@ -282,18 +292,25 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & else if (count == 3) { auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// s3(source, NOSIGN, format) -> s3(source, NOSIGN, format, structure) if (boost::iequals(second_arg, "NOSIGN")) { + if (checkAndGetLiteralArgument(args[2], "format") == "auto") + args.back() = format_literal; args.push_back(structure_literal); } + /// s3(source, format, structure) else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) { - args[count - 1] = structure_literal; + if (second_arg == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } + /// s3(source, access_key_id, access_key_id) -> s3(source, access_key_id, access_key_id, format, structure) else { - /// Add format=auto before structure argument. 
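
Editor's note: as a rough reading of the argument-rewriting branches above (a sketch with hypothetical values, not output produced by the patch), a call that omits format and structure has the resolved literals appended before the query is sent to the workers:

```sql
-- What the user writes on the initiator (format and structure omitted):
SELECT * FROM s3Cluster('my_cluster', 'https://bucket.example.com/data/data.bin', NOSIGN);

-- Roughly what the table function arguments look like after
-- updateStructureAndFormatArgumentsIfNeeded, assuming detection picked CSV and
-- schema inference produced two columns (both values are hypothetical):
SELECT * FROM s3Cluster('my_cluster', 'https://bucket.example.com/data/data.bin', NOSIGN, 'CSV', 'c1 String, c2 UInt64');
```
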
- args.push_back(std::make_shared("auto")); + args.push_back(format_literal); args.push_back(structure_literal); } } @@ -304,16 +321,27 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & else if (count == 4) { auto second_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// s3(source, NOSIGN, format, structure) if (boost::iequals(second_arg, "NOSIGN")) { - args[count - 1] = structure_literal; + if (checkAndGetLiteralArgument(args[2], "format") == "auto") + args[2] = format_literal; + if (checkAndGetLiteralArgument(args[3], "structure") == "auto") + args[3] = structure_literal; } + /// s3(source, format, structure, compression_method) else if (second_arg == "auto" || FormatFactory::instance().getAllFormats().contains(second_arg)) { - args[count - 2] = structure_literal; + if (second_arg == "auto") + args[1] = format_literal; + if (checkAndGetLiteralArgument(args[2], "structure") == "auto") + args[2] = structure_literal; } + /// s3(source, access_key_id, access_key_id, format) -> s3(source, access_key_id, access_key_id, format, structure) else { + if (checkAndGetLiteralArgument(args[3], "format") == "auto") + args[3] = format_literal; args.push_back(structure_literal); } } @@ -323,19 +351,30 @@ void TableFunctionS3::addColumnsStructureToArguments(ASTs & args, const String & else if (count == 5) { auto sedond_arg = checkAndGetLiteralArgument(args[1], "format/NOSIGN"); + /// s3(source, NOSIGN, format, structure, compression_method) if (boost::iequals(sedond_arg, "NOSIGN")) { - args[count - 2] = structure_literal; + if (checkAndGetLiteralArgument(args[2], "format") == "auto") + args[2] = format_literal; + if (checkAndGetLiteralArgument(args[3], "structure") == "auto") + args[3] = structure_literal; } + /// s3(source, access_key_id, access_key_id, format, structure) else { - args[count - 1] = structure_literal; + if (checkAndGetLiteralArgument(args[3], "format") == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[4], "structure") == "auto") + args[4] = structure_literal; } } /// s3(source, access_key_id, secret_access_key, format, structure, compression) else if (count == 6) { - args[count - 2] = structure_literal; + if (checkAndGetLiteralArgument(args[3], "format") == "auto") + args[3] = format_literal; + if (checkAndGetLiteralArgument(args[4], "structure") == "auto") + args[4] = structure_literal; } } } @@ -346,6 +385,9 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context, { context->checkAccess(getSourceAccessType()); configuration.update(context); + if (configuration.format == "auto") + return StorageS3::getTableStructureAndFormatFromData(configuration, std::nullopt, context).first; + return StorageS3::getTableStructureFromData(configuration, std::nullopt, context); } diff --git a/src/TableFunctions/TableFunctionS3.h b/src/TableFunctions/TableFunctionS3.h index fa73c1d313e..00ca36c6653 100644 --- a/src/TableFunctions/TableFunctionS3.h +++ b/src/TableFunctions/TableFunctionS3.h @@ -57,7 +57,7 @@ public: virtual void parseArgumentsImpl(ASTs & args, const ContextPtr & context); - static void addColumnsStructureToArguments(ASTs & args, const String & structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure, const String & format, const ContextPtr & context); protected: diff --git a/src/TableFunctions/TableFunctionS3Cluster.cpp b/src/TableFunctions/TableFunctionS3Cluster.cpp index ce96f7f580b..e727c4e4c89 100644 --- 
a/src/TableFunctions/TableFunctionS3Cluster.cpp +++ b/src/TableFunctions/TableFunctionS3Cluster.cpp @@ -21,9 +21,8 @@ StoragePtr TableFunctionS3Cluster::executeImpl( { StoragePtr storage; ColumnsDescription columns; - bool structure_argument_was_provided = configuration.structure != "auto"; - if (structure_argument_was_provided) + if (configuration.structure != "auto") { columns = parseColumnsListFromString(configuration.structure, context); } @@ -53,8 +52,7 @@ StoragePtr TableFunctionS3Cluster::executeImpl( StorageID(getDatabaseName(), table_name), columns, ConstraintsDescription{}, - context, - structure_argument_was_provided); + context); } storage->startup(); diff --git a/src/TableFunctions/TableFunctionURL.cpp b/src/TableFunctions/TableFunctionURL.cpp index aa535991d65..a78b2affa9a 100644 --- a/src/TableFunctions/TableFunctionURL.cpp +++ b/src/TableFunctions/TableFunctionURL.cpp @@ -55,7 +55,7 @@ void TableFunctionURL::parseArgumentsImpl(ASTs & args, const ContextPtr & contex format = configuration.format; if (format == "auto") - format = FormatFactory::instance().getFormatFromFileName(Poco::URI(filename).getPath(), true); + format = FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(filename).getPath()).value_or("auto"); StorageURL::evalArgsAndCollectHeaders(args, configuration.headers, context); } @@ -78,15 +78,24 @@ void TableFunctionURL::parseArgumentsImpl(ASTs & args, const ContextPtr & contex } } -void TableFunctionURL::addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context) +void TableFunctionURL::updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure_, const String & format_, const ContextPtr & context) { - if (tryGetNamedCollectionWithOverrides(args, context)) + if (auto collection = tryGetNamedCollectionWithOverrides(args, context)) { - /// In case of named collection, just add key-value pair "structure='...'" - /// at the end of arguments to override existed structure. - ASTs equal_func_args = {std::make_shared("structure"), std::make_shared(desired_structure)}; - auto equal_func = makeASTFunction("equals", std::move(equal_func_args)); - args.push_back(equal_func); + /// In case of named collection, just add key-value pairs "format='...', structure='...'" + /// at the end of arguments to override existed format and structure with "auto" values. 
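
Editor's note: for the named-collection branch described in the comment above, the override amounts to appending `format` and `structure` key-value arguments when the collection leaves them as `auto`. A hedged example with a hypothetical collection name:

```sql
-- A hypothetical named collection `web_data` that defines only the URL;
-- after the rewrite, the query sent to the workers carries explicit overrides:
SELECT * FROM url(web_data, format = 'JSONEachRow', structure = 'x UInt64, s String');
```
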
+ if (collection->getOrDefault("format", "auto") == "auto") + { + ASTs format_equal_func_args = {std::make_shared("format"), std::make_shared(format_)}; + auto format_equal_func = makeASTFunction("equals", std::move(format_equal_func_args)); + args.push_back(format_equal_func); + } + if (collection->getOrDefault("structure", "auto") == "auto") + { + ASTs structure_equal_func_args = {std::make_shared("structure"), std::make_shared(structure_)}; + auto structure_equal_func = makeASTFunction("equals", std::move(structure_equal_func_args)); + args.push_back(structure_equal_func); + } } else { @@ -101,7 +110,7 @@ void TableFunctionURL::addColumnsStructureToArguments(ASTs & args, const String args.pop_back(); } - ITableFunctionFileLike::addColumnsStructureToArguments(args, desired_structure, context); + ITableFunctionFileLike::updateStructureAndFormatArgumentsIfNeeded(args, structure_, format_, context); if (headers_ast) args.push_back(headers_ast); @@ -131,6 +140,14 @@ ColumnsDescription TableFunctionURL::getActualTableStructure(ContextPtr context, if (structure == "auto") { context->checkAccess(getSourceAccessType()); + if (format == "auto") + return StorageURL::getTableStructureAndFormatFromData( + filename, + chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method), + configuration.headers, + std::nullopt, + context).first; + return StorageURL::getTableStructureFromData(format, filename, chooseCompressionMethod(Poco::URI(filename).getPath(), compression_method), @@ -148,9 +165,9 @@ std::unordered_set TableFunctionURL::getVirtualsToCheckBeforeUsingStruct return {virtual_column_names.begin(), virtual_column_names.end()}; } -String TableFunctionURL::getFormatFromFirstArgument() +std::optional TableFunctionURL::tryGetFormatFromFirstArgument() { - return FormatFactory::instance().getFormatFromFileName(Poco::URI(filename).getPath(), true); + return FormatFactory::instance().tryGetFormatFromFileName(Poco::URI(filename).getPath()); } void registerTableFunctionURL(TableFunctionFactory & factory) diff --git a/src/TableFunctions/TableFunctionURL.h b/src/TableFunctions/TableFunctionURL.h index bf417f950c0..54e223283ba 100644 --- a/src/TableFunctions/TableFunctionURL.h +++ b/src/TableFunctions/TableFunctionURL.h @@ -34,7 +34,7 @@ public: ColumnsDescription getActualTableStructure(ContextPtr context, bool is_insert_query) const override; - static void addColumnsStructureToArguments(ASTs & args, const String & desired_structure, const ContextPtr & context); + static void updateStructureAndFormatArgumentsIfNeeded(ASTs & args, const String & structure_, const String & format_, const ContextPtr & context); std::unordered_set getVirtualsToCheckBeforeUsingStructureHint() const override; @@ -53,8 +53,7 @@ private: const char * getStorageTypeName() const override { return "URL"; } - String getFormatFromFirstArgument() override; - + std::optional tryGetFormatFromFirstArgument() override; }; } diff --git a/src/TableFunctions/TableFunctionURLCluster.cpp b/src/TableFunctions/TableFunctionURLCluster.cpp index a2949278155..5fd3c3342a5 100644 --- a/src/TableFunctions/TableFunctionURLCluster.cpp +++ b/src/TableFunctions/TableFunctionURLCluster.cpp @@ -40,8 +40,7 @@ StoragePtr TableFunctionURLCluster::getStorage( StorageID(getDatabaseName(), table_name), getActualTableStructure(context, /* is_insert_query */ true), ConstraintsDescription{}, - configuration, - structure != "auto"); + configuration); } return storage; } diff --git a/tests/integration/test_file_cluster/test.py 
b/tests/integration/test_file_cluster/test.py index d75cd6c7d23..5d12407e3f2 100644 --- a/tests/integration/test_file_cluster/test.py +++ b/tests/integration/test_file_cluster/test.py @@ -123,3 +123,91 @@ def test_no_such_files(started_cluster): distributed = node.query(get_query("*", True, "3,4")) assert TSV(local) == TSV(distributed) + + +def test_schema_inference(started_cluster): + node = started_cluster.instances["s0_0_0"] + + expected_result = node.query( + "select * from file('file*.csv', 'CSV', 's String, i UInt32') ORDER BY (i, s)" + ) + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv') ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', CSV) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', auto, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', CSV, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', auto, auto, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file*.csv', CSV, auto, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + +def test_format_detection(started_cluster): + for node_name in ("s0_0_0", "s0_0_1", "s0_1_0"): + for i in range(1, 3): + started_cluster.instances[node_name].query( + f""" + INSERT INTO TABLE FUNCTION file( + 'file_for_format_detection_{i}', 'CSV', 's String, i UInt32') VALUES ('file{i}',{i}) + """ + ) + + node = started_cluster.instances["s0_0_0"] + expected_result = node.query( + "select * from file('file_for_format_detection*', 'CSV', 's String, i UInt32') ORDER BY (i, s)" + ) + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*') ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, 's String, i UInt32') ORDER BY (i, s)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, auto, auto) ORDER BY (c1, c2)" + ) + assert result == expected_result + + result = node.query( + "select * from fileCluster('my_cluster', 'file_for_format_detection*', auto, 's String, i UInt32', auto) ORDER BY (i, s)" + ) + assert result == expected_result diff --git a/tests/integration/test_s3_cluster/test.py b/tests/integration/test_s3_cluster/test.py index 673ca318c92..03919ee6a4d 100644 --- a/tests/integration/test_s3_cluster/test.py +++ b/tests/integration/test_s3_cluster/test.py @@ -35,7 +35,9 @@ def create_buckets_s3(cluster): # Make all files a bit different for number in range(100 + file_number): - data.append([str(number + file_number) * 10, number + file_number]) + data.append( + 
["str_" + str(number + file_number) * 10, number + file_number] + ) writer = csv.writer(f) writer.writerows(data) @@ -427,3 +429,33 @@ def test_cluster_with_named_collection(started_cluster): ) assert TSV(pure_s3) == TSV(s3_cluster) + + +def test_cluster_format_detection(started_cluster): + node = started_cluster.instances["s0_0_0"] + + expected_desc_result = node.query( + "desc s3('http://minio1:9001/root/data/generated/*', 'minio', 'minio123', 'CSV')" + ) + + desc_result = node.query( + "desc s3('http://minio1:9001/root/data/generated/*', 'minio', 'minio123')" + ) + + assert expected_desc_result == desc_result + + expected_result = node.query( + "SELECT * FROM s3('http://minio1:9001/root/data/generated/*', 'minio', 'minio123', 'CSV', 'a String, b UInt64') order by a, b" + ) + + result = node.query( + "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/generated/*', 'minio', 'minio123') order by c1, c2" + ) + + assert result == expected_result + + result = node.query( + "SELECT * FROM s3Cluster(cluster_simple, 'http://minio1:9001/root/data/generated/*', 'minio', 'minio123', auto, 'a String, b UInt64') order by a, b" + ) + + assert result == expected_result diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 3cccd07c134..75ef50ec12a 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -1250,3 +1250,73 @@ def test_size_virtual_column(cluster): result == "test_size_virtual_column1.tsv\t2\ntest_size_virtual_column2.tsv\t3\ntest_size_virtual_column3.tsv\t4\n" ) + + +def test_format_detection(cluster): + node = cluster.instances["node"] + storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + account_name = "devstoreaccount1" + account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection0', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt64, y String') select number as x, 'str_' || toString(number) from numbers(0)", + ) + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt64, y String') select number as x, 'str_' || toString(number) from numbers(10)", + ) + + expected_desc_result = azure_query( + node, + f"desc azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'auto')", + ) + + desc_result = azure_query( + node, + f"desc azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}')", + ) + + assert expected_desc_result == desc_result + + expected_result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt64, y String')", + ) + + result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}')", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', auto, auto, 'x 
UInt64, y String')", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection{{0,1}}', '{account_name}', '{account_key}')", + ) + + assert result == expected_result + + node.query(f"system drop schema cache for hdfs") + + result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection{{0,1}}', '{account_name}', '{account_key}')", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection{{0,1}}', '{account_name}', '{account_key}')", + ) + + assert result == expected_result diff --git a/tests/integration/test_storage_azure_blob_storage/test_cluster.py b/tests/integration/test_storage_azure_blob_storage/test_cluster.py index 2bd3f24d25f..6c5e2d20ca5 100644 --- a/tests/integration/test_storage_azure_blob_storage/test_cluster.py +++ b/tests/integration/test_storage_azure_blob_storage/test_cluster.py @@ -262,3 +262,72 @@ def test_partition_parallel_reading_with_cluster(cluster): ) assert azure_cluster == "3\n" + + +def test_format_detection(cluster): + node = cluster.instances["node_0"] + storage_account_url = cluster.env_variables["AZURITE_STORAGE_ACCOUNT_URL"] + account_name = "devstoreaccount1" + account_key = "Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==" + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection0', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') select number as x, 'str_' || toString(number) from numbers(10)", + ) + + azure_query( + node, + f"INSERT INTO TABLE FUNCTION azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection1', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') select number as x, 'str_' || toString(number) from numbers(10, 10)", + ) + + expected_desc_result = azure_query( + node, + f"desc azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'auto')", + ) + + desc_result = azure_query( + node, + f"desc azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}')", + ) + + assert expected_desc_result == desc_result + + expected_result = azure_query( + node, + f"select * from azureBlobStorage('{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', 'JSONEachRow', 'auto', 'x UInt32, y String') order by x", + ) + + result = azure_query( + node, + f"select * from azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}') order by x", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', auto) order by x", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', auto, auto) order by x", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from 
azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', 'x UInt32, y String') order by x", + ) + + assert result == expected_result + + result = azure_query( + node, + f"select * from azureBlobStorageCluster('simple_cluster', '{storage_account_url}', 'cont', 'test_format_detection*', '{account_name}', '{account_key}', auto, auto, 'x UInt32, y String') order by x", + ) + + assert result == expected_result diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 8ed1e4b6c0e..8dee15f4d94 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -1047,6 +1047,74 @@ def test_union_schema_inference_mode(started_cluster): assert "Cannot extract table structure" in error +def test_format_detection(started_cluster): + node = started_cluster.instances["node1"] + + node.query( + "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection0', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(0)" + ) + + node.query( + "insert into function hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow) select number as x, 'str_' || toString(number) as y from numbers(10)" + ) + + expected_desc_result = node.query( + "desc hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow)" + ) + + desc_result = node.query("desc hdfs('hdfs://hdfs1:9000/test_format_detection1')") + + assert expected_desc_result == desc_result + + expected_result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection1', JSONEachRow, 'x UInt64, y String') order by x, y" + ) + + result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection1') order by x, y" + ) + + assert expected_result == result + + result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection1', auto, 'x UInt64, y String') order by x, y" + ) + + assert expected_result == result + + result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection{0,1}') order by x, y" + ) + + assert expected_result == result + + node.query("system drop schema cache for hdfs") + + result = node.query( + "select * from hdfs('hdfs://hdfs1:9000/test_format_detection{0,1}') order by x, y" + ) + + assert expected_result == result + + result = node.query( + "select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/test_format_detection{0,1}') order by x, y" + ) + + assert expected_result == result + + result = node.query( + "select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/test_format_detection{0,1}', auto, auto) order by x, y" + ) + + assert expected_result == result + + result = node.query( + "select * from hdfsCluster(test_cluster_two_shards, 'hdfs://hdfs1:9000/test_format_detection{0,1}', auto, 'x UInt64, y String') order by x, y" + ) + + assert expected_result == result + + if __name__ == "__main__": cluster.start() input("Cluster created, press any key to destroy...") diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 2549cb0d473..365ade7da65 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -2194,3 +2194,57 @@ def test_union_schema_inference_mode(started_cluster): f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3,4}}.jsonl') settings schema_inference_mode='union', 
describe_compact_output=1 format TSV" ) assert "Cannot extract table structure" in error + + +def test_s3_format_detection(started_cluster): + bucket = started_cluster.minio_bucket + instance = started_cluster.instances["dummy"] + + instance.query( + f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection0', 'JSONEachRow', 'x UInt64, y String') select number, 'str_' || toString(number) from numbers(0) settings s3_truncate_on_insert=1" + ) + + instance.query( + f"insert into table function s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', 'JSONEachRow', 'x UInt64, y String') select number, 'str_' || toString(number) from numbers(5) settings s3_truncate_on_insert=1" + ) + + expected_result = instance.query( + f"select * from s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', 'JSONEachRow', 'x UInt64, y String')" + ) + + expected_desc_result = instance.query( + f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', 'JSONEachRow')" + ) + + for engine in ["s3", "url"]: + desc_result = instance.query( + f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1')" + ) + + assert desc_result == expected_desc_result + + result = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1')" + ) + + assert result == expected_result + + result = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection1', auto, 'x UInt64, y String')" + ) + + assert result == expected_result + + result = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection{{0,1}}', auto, 'x UInt64, y String')" + ) + + assert result == expected_result + + instance.query(f"system drop schema cache for {engine}") + + result = instance.query( + f"select * from {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_format_detection{{0,1}}', auto, 'x UInt64, y String')" + ) + + assert result == expected_result diff --git a/tests/queries/0_stateless/02969_auto_format_detection.reference b/tests/queries/0_stateless/02969_auto_format_detection.reference new file mode 100644 index 00000000000..4b86be04996 --- /dev/null +++ b/tests/queries/0_stateless/02969_auto_format_detection.reference @@ -0,0 +1,123 @@ +Parquet +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +ORC +a Nullable(Int64) +b Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +Arrow +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +ArrowStream +a Nullable(UInt64) +b Nullable(String) +c Array(Nullable(UInt64)) +d Tuple(\n a Nullable(UInt64),\n b Nullable(String)) +Avro +a Int64 +b String +c Array(Int64) +d Tuple(\n a Int64,\n b String) +Native +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +BSONEachRow +a Nullable(Int64) +b Nullable(String) +c Array(Nullable(Int64)) +d Tuple(\n a Nullable(Int64),\n b Nullable(String)) +JSONCompact +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +Values +c1 
Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) +TSKV +a Nullable(String) +b Nullable(String) +c Array(Nullable(UInt64)) +d Nullable(String) +JSONObjectEachRow +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +JSONColumns +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +JSONCompactColumns +c1 Nullable(String) +c2 Nullable(String) +c3 Array(Nullable(String)) +c4 Tuple(\n a Nullable(String),\n b Nullable(String)) +JSONCompact +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +JSON +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +TSV +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Tuple(Nullable(UInt64), Nullable(String)) +CSV +c1 Nullable(UInt64) +c2 Nullable(String) +c3 Array(Nullable(UInt64)) +c4 Nullable(UInt64) +c5 Nullable(String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a UInt64 +b String +c Array(UInt64) +d Tuple(\n a UInt64,\n b String) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +1 +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) +a Nullable(String) +b Nullable(String) +c Array(Nullable(String)) +d Tuple(\n a Nullable(String),\n b Nullable(String)) diff --git a/tests/queries/0_stateless/02969_auto_format_detection.sh b/tests/queries/0_stateless/02969_auto_format_detection.sh new file mode 100755 index 00000000000..5b9b4e09efa --- /dev/null +++ b/tests/queries/0_stateless/02969_auto_format_detection.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. 
"$CURDIR"/../shell_config.sh + +DATA_FILE=$CLICKHOUSE_TEST_UNIQUE_NAME.data + +for format in Parquet ORC Arrow ArrowStream Avro Native BSONEachRow JSONCompact Values TSKV JSONObjectEachRow JSONColumns JSONCompactColumns JSONCompact JSON TSV CSV +do + echo $format + $CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format $format" > $DATA_FILE + $CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE')" +done + +rm $DATA_FILE + +$CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format JSONEachRow" > $DATA_FILE.jsonl +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE*')" + + +$CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format JSONEachRow" > $DATA_FILE + +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE', auto, 'a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)')" + +$CLICKHOUSE_LOCAL -nmq " +desc file('$DATA_FILE'); +desc file('$DATA_FILE'); +" + +$CLICKHOUSE_LOCAL -nmq " +desc file('$DATA_FILE', JSONEachRow); +desc file('$DATA_FILE'); +" + +touch $DATA_FILE.1 +$CLICKHOUSE_LOCAL -q "select * from generateRandom('a UInt64, b String, c Array(UInt64), d Tuple(a UInt64, b String)', 42) limit 10 format JSONEachRow" > $DATA_FILE.2 +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE.{1,2}')" +$CLICKHOUSE_LOCAL -q "desc file('$DATA_FILE.{1,2}') settings schema_inference_mode='union'" 2>&1 | grep -c "CANNOT_DETECT_FORMAT" + +$CLICKHOUSE_LOCAL -nmq " +desc file('$DATA_FILE.2'); +desc file('$DATA_FILE.{1,2}'); +" + +rm $DATA_FILE* From f05174e441f8efaa732f9e717f46259a9a8e479b Mon Sep 17 00:00:00 2001 From: avogar Date: Mon, 22 Jan 2024 23:28:17 +0000 Subject: [PATCH 125/884] Fix style --- docs/en/interfaces/schema-inference.md | 2 +- src/Formats/ReadSchemaUtils.cpp | 2 +- src/Storages/IStorageCluster.h | 2 +- src/Storages/StorageAzureBlob.cpp | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index d255688da1f..3d3ee5c83d6 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -1988,7 +1988,7 @@ Note: - If you have a lot of files, reading schema from all of them can take a lot of time. -## Automatic format detection {#autimatic-format-detection} +## Automatic format detection {#automatic-format-detection} If data format is not specified and cannot be determined by the file extension, ClickHouse will try to detect the file format by its content. diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index b4fba7b9ce6..c882f15b4b0 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -379,7 +379,7 @@ try /// We choose the format with larger number of columns in inferred schema. 
size_t max_number_of_columns = 0; - for (const auto & [format_to_detect, schema] : format_to_schema ) + for (const auto & [format_to_detect, schema] : format_to_schema) { if (schema.size() > max_number_of_columns) { diff --git a/src/Storages/IStorageCluster.h b/src/Storages/IStorageCluster.h index 28ebda5125e..92d9a84b758 100644 --- a/src/Storages/IStorageCluster.h +++ b/src/Storages/IStorageCluster.h @@ -41,7 +41,7 @@ public: protected: virtual void updateBeforeRead(const ContextPtr &) {} - virtual void updateQueryToSendIfNeeded(ASTPtr & /*query*/, const StorageSnapshotPtr & /*storage_snapshot*/, const ContextPtr & /*context*/) {} + virtual void updateQueryToSendIfNeeded(ASTPtr & /*query*/, const StorageSnapshotPtr & /*storage_snapshot*/, const ContextPtr & /*context*/) {} private: Poco::Logger * log; diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 888d360aff1..35072dc5cae 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -1335,7 +1335,7 @@ namespace "in AzureBlobStorage. You can specify table structure manually", *format); throw Exception( - ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, + ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files, because there are no files with provided path " "in AzureBlobStorage. You can specify table structure manually"); } @@ -1407,7 +1407,7 @@ namespace { format = format_name; } - + String getLastFileName() const override { return current_path_with_metadata.relative_path; } bool supportsLastReadBufferRecreation() const override { return true; } From 1bff525666b96e1a433d07e933e76a845e503dcb Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 23 Jan 2024 09:35:41 +0000 Subject: [PATCH 126/884] Fix tests and docs --- docs/en/interfaces/schema-inference.md | 2 +- src/Client/ClientBase.cpp | 6 +++--- tests/queries/0_stateless/02969_auto_format_detection.sh | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/en/interfaces/schema-inference.md b/docs/en/interfaces/schema-inference.md index 3d3ee5c83d6..6a7b744dd43 100644 --- a/docs/en/interfaces/schema-inference.md +++ b/docs/en/interfaces/schema-inference.md @@ -55,7 +55,7 @@ DESCRIBE file('hobbies.jsonl') └─────────┴─────────────────────────┴──────────────┴────────────────────┴─────────┴──────────────────┴────────────────┘ ``` -## Table engines [File](../engines/table-engines/special/file.md), [S3](../engines/table-engines/integrations/s3.md), [URL](../engines/table-engines/special/url.md), [HDFS](../engines/table-engines/integrations/hdfs.md), [azureBlobStorage](./engines/table-engines/integrations/azureBlobStorage.md) +## Table engines [File](../engines/table-engines/special/file.md), [S3](../engines/table-engines/integrations/s3.md), [URL](../engines/table-engines/special/url.md), [HDFS](../engines/table-engines/integrations/hdfs.md), [azureBlobStorage](../engines/table-engines/integrations/azureBlobStorage.md) If the list of columns is not specified in `CREATE TABLE` query, the structure of the table will be inferred automatically from the data. 
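
Editor's note: since the documentation hunk above describes automatic format detection in general terms, a brief hedged example may help (the file name is made up; it mirrors the new stateless and integration tests):

```sql
-- Write a file whose name carries no format hint:
INSERT INTO FUNCTION file('events_data', JSONEachRow)
SELECT number AS x, concat('str_', toString(number)) AS s FROM numbers(3);

-- Both the format and the structure are then detected from the file contents:
DESCRIBE file('events_data');
SELECT * FROM file('events_data');
```
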
diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 01eff0d3e4c..6c39c3ebc95 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -632,9 +632,9 @@ try } else if (query_with_output->out_file) { - const auto & format_name = FormatFactory::instance().getFormatFromFileName(out_file); - if (!format_name.empty()) - current_format = format_name; + auto format_name = FormatFactory::instance().tryGetFormatFromFileName(out_file); + if (format_name) + current_format = *format_name; } } diff --git a/tests/queries/0_stateless/02969_auto_format_detection.sh b/tests/queries/0_stateless/02969_auto_format_detection.sh index 5b9b4e09efa..88d6575e499 100755 --- a/tests/queries/0_stateless/02969_auto_format_detection.sh +++ b/tests/queries/0_stateless/02969_auto_format_detection.sh @@ -1,4 +1,5 @@ #!/usr/bin/env bash +# Tags: no-fasttest CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh From 48609d3c9f55b00cd787a5592c8f51a96ac9ad42 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 23 Jan 2024 09:36:01 +0000 Subject: [PATCH 127/884] Fix tests --- programs/local/LocalServer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index dd96532aadd..2caf3a559a9 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -336,23 +336,23 @@ std::string LocalServer::getInitialCreateTableQuery() auto table_structure = config().getString("table-structure", "auto"); String table_file; - String format_from_file_name; + std::optional format_from_file_name; if (!config().has("table-file") || config().getString("table-file") == "-") { /// Use Unix tools stdin naming convention table_file = "stdin"; - format_from_file_name = FormatFactory::instance().getFormatFromFileDescriptor(STDIN_FILENO); + format_from_file_name = FormatFactory::instance().tryGetFormatFromFileDescriptor(STDIN_FILENO); } else { /// Use regular file auto file_name = config().getString("table-file"); table_file = quoteString(file_name); - format_from_file_name = FormatFactory::instance().getFormatFromFileName(file_name); + format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName(file_name); } auto data_format = backQuoteIfNeed( - config().getString("table-data-format", config().getString("format", format_from_file_name.empty() ? "TSV" : format_from_file_name))); + config().getString("table-data-format", config().getString("format", format_from_file_name ? 
*format_from_file_name : "TSV"))); if (table_structure == "auto") From eaca40c53efb4c08878165219242bc22a004371f Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 23 Jan 2024 10:28:57 +0000 Subject: [PATCH 128/884] Update tests --- src/Formats/ReadSchemaUtils.cpp | 8 ++++---- tests/queries/0_stateless/01030_storage_url_syntax.sql | 4 ++-- tests/queries/0_stateless/02424_pod_array_overflow.sql | 2 +- tests/queries/0_stateless/02426_pod_array_overflow_2.sql | 2 +- tests/queries/0_stateless/02426_pod_array_overflow_3.sql | 2 +- .../queries/0_stateless/02497_schema_inference_nulls.sql | 4 ++-- .../0_stateless/02502_bad_values_schema_inference.sql | 2 +- .../02783_max_bytes_to_read_in_schema_inference.sql | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index c882f15b4b0..08e05872c97 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -193,13 +193,13 @@ try if (format_name) throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, - "The table structure cannot be extracted from a {} format file:\n{}\nYou can specify the structure manually", + "The table structure cannot be extracted from a {} format file:\n{}\n.You can specify the structure manually", *format_name, exception_message); throw Exception( ErrorCodes::CANNOT_DETECT_FORMAT, - "The data format cannot be detected by the contents of the files:\n{}\nYou can specify the format manually", + "The data format cannot be detected by the contents of the files:\n{}\n.You can specify the format manually", exception_message); } @@ -274,7 +274,7 @@ try throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "The table structure cannot be extracted from a {} format file. " - "Error: {}. You can specify the structure manually", + "Error:\n{}.\nYou can specify the structure manually", *format_name, exception_message); } @@ -473,7 +473,7 @@ try throw Exception( ErrorCodes::CANNOT_EXTRACT_TABLE_STRUCTURE, "The table structure cannot be extracted from a {} format file. " - "Error: {}. 
You can specify the structure manually", + "Error:\n{}.\nYou can specify the structure manually", *format_name, exception_messages); } diff --git a/tests/queries/0_stateless/01030_storage_url_syntax.sql b/tests/queries/0_stateless/01030_storage_url_syntax.sql index 9b31558eece..eda108aca2f 100644 --- a/tests/queries/0_stateless/01030_storage_url_syntax.sql +++ b/tests/queries/0_stateless/01030_storage_url_syntax.sql @@ -1,7 +1,7 @@ drop table if exists test_table_url_syntax ; create table test_table_url_syntax (id UInt32) ENGINE = URL('') -; -- { serverError 36 } +; -- { serverError UNSUPPORTED_URI_SCHEME } create table test_table_url_syntax (id UInt32) ENGINE = URL('','','','') ; -- { serverError 42 } drop table if exists test_table_url_syntax @@ -11,7 +11,7 @@ drop table if exists test_table_url ; create table test_table_url(id UInt32) ENGINE = URL('http://localhost/endpoint') -; -- { serverError 36 } +; -- { serverError CANNOT_DETECT_FORMAT } create table test_table_url(id UInt32) ENGINE = URL('http://localhost/endpoint.json'); drop table test_table_url; diff --git a/tests/queries/0_stateless/02424_pod_array_overflow.sql b/tests/queries/0_stateless/02424_pod_array_overflow.sql index 4b85d5be029..50c46cf19f1 100644 --- a/tests/queries/0_stateless/02424_pod_array_overflow.sql +++ b/tests/queries/0_stateless/02424_pod_array_overflow.sql @@ -1 +1 @@ -SELECT * FROM format(Native, '\x02\x02\x02\x6b\x30\x1a\x4d\x61\x70\x28\x46\x69\x78\x65\x64\x53\x74\x72\x69\x6e\x67\x28\x31\x29\x2c\x20\x49\x6e\x74\x36\x34\x29\x01\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x7f\x00\x7f\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x64\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xcf\x31\x3f\x56\x69\x11\x89\x25'); -- { serverError 128 } +SELECT * FROM format(Native, '\x02\x02\x02\x6b\x30\x1a\x4d\x61\x70\x28\x46\x69\x78\x65\x64\x53\x74\x72\x69\x6e\x67\x28\x31\x29\x2c\x20\x49\x6e\x74\x36\x34\x29\x01\x00\x00\x00\x00\x00\x00\x00\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\xff\x7f\x00\x7f\xff\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x64\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xcf\x31\x3f\x56\x69\x11\x89\x25'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02426_pod_array_overflow_2.sql b/tests/queries/0_stateless/02426_pod_array_overflow_2.sql index 52a00730227..6a0d97acee3 100644 --- a/tests/queries/0_stateless/02426_pod_array_overflow_2.sql +++ b/tests/queries/0_stateless/02426_pod_array_overflow_2.sql @@ -1 +1 @@ -SELECT * FROM format(Native, 'k0\x23Array(Tuple(FixedString(1), Int64))\0\0\0\0\0\0\0�����\0����������������\0�\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0d\0\0\0\0\0\0\0\0\0\0\0\0\0�1?Vi�%'); -- { serverError 128 } +SELECT * FROM format(Native, 'k0\x23Array(Tuple(FixedString(1), Int64))\0\0\0\0\0\0\0�����\0����������������\0�\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0d\0\0\0\0\0\0\0\0\0\0\0\0\0�1?Vi�%'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02426_pod_array_overflow_3.sql b/tests/queries/0_stateless/02426_pod_array_overflow_3.sql index 857ba2ca28e..caabf7d1679 100644 --- a/tests/queries/0_stateless/02426_pod_array_overflow_3.sql +++ b/tests/queries/0_stateless/02426_pod_array_overflow_3.sql @@ -1 +1 @@ -SELECT * FROM format(Native, '\x01\x01\x01x\x0CArray(UInt8)\x01\x00\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF'); -- { serverError 128 } +SELECT * FROM format(Native, 
'\x01\x01\x01x\x0CArray(UInt8)\x01\x00\xBD\xEF\xBF\xBD\xEF\xBF\xBD\xEF'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02497_schema_inference_nulls.sql b/tests/queries/0_stateless/02497_schema_inference_nulls.sql index a25060e8182..b78b5709dbb 100644 --- a/tests/queries/0_stateless/02497_schema_inference_nulls.sql +++ b/tests/queries/0_stateless/02497_schema_inference_nulls.sql @@ -4,7 +4,7 @@ set input_format_json_try_infer_named_tuples_from_objects=0; set input_format_json_read_objects_as_strings=0; set input_format_json_infer_incomplete_types_as_strings=0; set input_format_json_read_numbers_as_strings=0; -desc format(JSONEachRow, '{"x" : 1234}, {"x" : "String"}') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH } +desc format(JSONEachRow, '{"x" : 1234}, {"x" : "String"}') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } desc format(JSONEachRow, '{"x" : [null, 1]}'); desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : []}'); desc format(JSONEachRow, '{"x" : [null, 1]}, {"x" : [null]}'); @@ -26,7 +26,7 @@ desc format(JSONEachRow, '{"x" : [1, 2]}, {"x" : [null]}'); select 'JSONCompactEachRow'; set schema_inference_make_columns_nullable=1; -desc format(JSONCompactEachRow, '[1234], ["String"]') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError TYPE_MISMATCH } +desc format(JSONCompactEachRow, '[1234], ["String"]') settings input_format_json_try_infer_numbers_from_strings=1; -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } desc format(JSONCompactEachRow, '[[null, 1]]'); desc format(JSONCompactEachRow, '[[null, 1]], [[]]'); desc format(JSONCompactEachRow, '[[null, 1]], [[null]]'); diff --git a/tests/queries/0_stateless/02502_bad_values_schema_inference.sql b/tests/queries/0_stateless/02502_bad_values_schema_inference.sql index 4c796842c0d..67ac09832de 100644 --- a/tests/queries/0_stateless/02502_bad_values_schema_inference.sql +++ b/tests/queries/0_stateless/02502_bad_values_schema_inference.sql @@ -1,2 +1,2 @@ -desc format(Values, '(\'abc)'); -- { serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED } +desc format(Values, '(\'abc)'); -- { serverError CANNOT_EXTRACT_TABLE_STRUCTURE } diff --git a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql index b4165e8e80a..ef0381df1a6 100644 --- a/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql +++ b/tests/queries/0_stateless/02783_max_bytes_to_read_in_schema_inference.sql @@ -1,5 +1,5 @@ set input_format_max_rows_to_read_for_schema_inference=2; set input_format_json_infer_incomplete_types_as_strings=0; -desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=10; -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=10; -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} desc format('JSONEachRow', '{"a" : null}, {"a" : 42}') settings input_format_max_bytes_to_read_for_schema_inference=20; From 5e4796ae161e0546845f2dca167671eadf9463c3 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 23 Jan 2024 12:46:07 +0000 Subject: [PATCH 129/884] Fix heap-use-after-free --- src/Formats/ReadSchemaUtils.cpp | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git 
a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 08e05872c97..4c734130622 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -349,31 +349,35 @@ try if (!format_name) { std::unordered_map format_to_schema; - for (const auto & format_to_detect : getSimilarFormatsSetForDetection()) + const auto & formats_set_to_detect = getSimilarFormatsSetForDetection(); + for (size_t i = 0; i != formats_set_to_detect.size(); ++i) { try { schema_reader = FormatFactory::instance().getSchemaReader( - format_to_detect, support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); + formats_set_to_detect[i], support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); auto tmp_names_and_types = schema_reader->readSchema(); /// If schema was inferred successfully for this format, remember it and try next format. if (!tmp_names_and_types.empty()) - format_to_schema[format_to_detect] = tmp_names_and_types; + format_to_schema[formats_set_to_detect[i]] = tmp_names_and_types; } catch (...) // NOLINT(bugprone-empty-catch) { /// Try next format. } - if (support_buf_recreation) + if (i != formats_set_to_detect.size() - 1) { - read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf)); - iterator_data.buf = read_buffer_iterator.recreateLastReadBuffer(); - } - else - { - peekable_buf->rollbackToCheckpoint(); + if (support_buf_recreation) + { + read_buffer_iterator.setPreviousReadBuffer(std::move(iterator_data.buf)); + iterator_data.buf = read_buffer_iterator.recreateLastReadBuffer(); + } + else + { + peekable_buf->rollbackToCheckpoint(); + } } } From 93fbe1d9c8dc1af75094e81fc25e20ee1241bab3 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 23 Jan 2024 16:17:16 +0000 Subject: [PATCH 130/884] Fixes --- src/Formats/FormatFactory.cpp | 2 +- src/Formats/ReadSchemaUtils.cpp | 16 ++++++++-------- .../0_stateless/02185_orc_corrupted_file.sh | 2 +- .../02245_parquet_skip_unknown_type.sh | 2 +- ..._json_wrong_root_type_in_schema_inference.sql | 4 ++-- .../0_stateless/02286_mysql_dump_input_format.sh | 2 +- .../0_stateless/02293_formats_json_columns.sh | 2 +- .../02327_capnproto_protobuf_empty_messages.sh | 8 ++++---- .../0_stateless/02416_json_object_inference.sql | 2 +- ...uplicate_column_names_in_schema_inference.sql | 8 ++++---- .../02458_use_structure_from_insertion_table.sql | 10 +++++----- tests/queries/0_stateless/02724_database_s3.sh | 2 +- tests/queries/0_stateless/02725_database_hdfs.sh | 2 +- .../02900_union_schema_inference_mode.sh | 4 ++-- 14 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index cacb5a510da..b6f8f041d8d 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -673,7 +673,7 @@ std::optional FormatFactory::tryGetFormatFromFileDescriptor(int fd) #elif defined(OS_DARWIN) char file_path[PATH_MAX] = {'\0'}; if (fcntl(fd, F_GETPATH, file_path) != -1) - return tryGetFormatFromFileName(file_path, false); + return tryGetFormatFromFileName(file_path); return std::nullopt; #else (void)fd; diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 4c734130622..5576da56dbf 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -127,7 +127,6 @@ try IReadBufferIterator::Data iterator_data; std::vector> schemas_for_union_mode; std::string exception_messages; - 
SchemaReaderPtr schema_reader; size_t max_rows_to_read = format_settings ? format_settings->max_rows_to_read_for_schema_inference : context->getSettingsRef().input_format_max_rows_to_read_for_schema_inference; size_t max_bytes_to_read = format_settings ? format_settings->max_bytes_to_read_for_schema_inference @@ -227,6 +226,8 @@ try continue; } + SchemaReaderPtr schema_reader; + if (format_name) { try @@ -417,12 +418,11 @@ try if (!format_name) throw Exception(ErrorCodes::CANNOT_DETECT_FORMAT, "The data format cannot be detected by the contents of the files. You can specify the format manually"); - /// If we got all schemas from cache, schema_reader can be uninitialized. - /// But we still need some stateless methods of ISchemaReader, - /// let's initialize it with empty buffer. + /// We need some stateless methods of ISchemaReader, but during reading schema we + /// could not even create a schema reader (for example when we got schema from cache). + /// Let's create stateless schema reader from empty read buffer. EmptyReadBuffer empty; - if (!schema_reader) - schema_reader = FormatFactory::instance().getSchemaReader(*format_name, empty, context, format_settings); + SchemaReaderPtr stateless_schema_reader = FormatFactory::instance().getSchemaReader(*format_name, empty, context, format_settings); if (mode == SchemaInferenceMode::UNION) { @@ -449,7 +449,7 @@ try /// If types are not the same, try to transform them according /// to the format to find common type. auto new_type_copy = type; - schema_reader->transformTypesFromDifferentFilesIfNeeded(it->second, new_type_copy); + stateless_schema_reader->transformTypesFromDifferentFilesIfNeeded(it->second, new_type_copy); /// If types are not the same after transform, we cannot do anything, throw an exception. if (!it->second->equals(*new_type_copy)) @@ -495,7 +495,7 @@ try /// It will allow to execute simple data loading with query /// "INSERT INTO table SELECT * FROM ..." 
const auto & insertion_table = context->getInsertionTable(); - if (schema_reader && !schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) + if (!stateless_schema_reader->hasStrictOrderOfColumns() && !insertion_table.empty()) { auto storage = DatabaseCatalog::instance().getTable(insertion_table, context); auto metadata = storage->getInMemoryMetadataPtr(); diff --git a/tests/queries/0_stateless/02185_orc_corrupted_file.sh b/tests/queries/0_stateless/02185_orc_corrupted_file.sh index 1987f094faa..12510ae3836 100755 --- a/tests/queries/0_stateless/02185_orc_corrupted_file.sh +++ b/tests/queries/0_stateless/02185_orc_corrupted_file.sh @@ -8,4 +8,4 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) USER_FILES_PATH=$($CLICKHOUSE_CLIENT_BINARY --query "select _path,_file from file('nonexist.txt', 'CSV', 'val1 char')" 2>&1 | grep Exception | awk '{gsub("/nonexist.txt","",$9); print $9}') cp $CUR_DIR/data_orc/corrupted.orc $USER_FILES_PATH/ -${CLICKHOUSE_CLIENT} --query="select * from file('corrupted.orc')" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL' +${CLICKHOUSE_CLIENT} --query="select * from file('corrupted.orc')" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' diff --git a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh index 954e2e83f27..8ff6e28b123 100755 --- a/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh +++ b/tests/queries/0_stateless/02245_parquet_skip_unknown_type.sh @@ -12,6 +12,6 @@ DATA_FILE=$USER_FILES_PATH/$FILE_NAME cp $CUR_DIR/data_parquet_bad_column/metadata_0.parquet $DATA_FILE -$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet)" 2>&1 | grep -qF "Cannot extract table structure" && echo "OK" || echo "FAIL" +$CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet)" 2>&1 | grep -qF "CANNOT_EXTRACT_TABLE_STRUCTURE" && echo "OK" || echo "FAIL" $CLICKHOUSE_CLIENT -q "desc file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" $CLICKHOUSE_CLIENT -q "select count(*) from file(test_02245.parquet) settings input_format_parquet_skip_columns_with_unsupported_types_in_schema_inference=1" diff --git a/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql b/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql index 5462d38f1a3..98bf29c32f5 100644 --- a/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql +++ b/tests/queries/0_stateless/02268_json_wrong_root_type_in_schema_inference.sql @@ -1,7 +1,7 @@ -- Tags: no-fasttest insert into function file('02268_data.jsonl', 'TSV') select 1; -select * from file('02268_data.jsonl'); --{serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} +select * from file('02268_data.jsonl'); --{serverError CANNOT_EXTRACT_TABLE_STRUCTURE} insert into function file('02268_data.jsonCompactEachRow', 'TSV') select 1; -select * from file('02268_data.jsonCompactEachRow'); --{serverError CANNOT_PARSE_INPUT_ASSERTION_FAILED} +select * from file('02268_data.jsonCompactEachRow'); --{serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02286_mysql_dump_input_format.sh b/tests/queries/0_stateless/02286_mysql_dump_input_format.sh index a3711497ae8..2f6167c3ddf 100755 --- a/tests/queries/0_stateless/02286_mysql_dump_input_format.sh +++ b/tests/queries/0_stateless/02286_mysql_dump_input_format.sh @@ -23,7 +23,7 @@ $CLICKHOUSE_CLIENT -q "desc 
file(dump1.sql, MySQLDump) settings input_format_mys $CLICKHOUSE_CLIENT -q "select * from file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test', max_threads=1" $CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test2'" $CLICKHOUSE_CLIENT -q "select * from file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test2', max_threads=1" -$CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test 3'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "desc file(dump1.sql, MySQLDump) settings input_format_mysql_dump_table_name='test 3'" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' $CLICKHOUSE_CLIENT -q "select * from file(dump1.sql, MySQLDump, 'x Nullable(Int32)') settings input_format_mysql_dump_table_name='test 3'" 2>&1 | grep -F -q 'EMPTY_DATA_PASSED' && echo 'OK' || echo 'FAIL' echo "dump2" diff --git a/tests/queries/0_stateless/02293_formats_json_columns.sh b/tests/queries/0_stateless/02293_formats_json_columns.sh index ce35c4bd878..4eae5a1abb4 100755 --- a/tests/queries/0_stateless/02293_formats_json_columns.sh +++ b/tests/queries/0_stateless/02293_formats_json_columns.sh @@ -88,4 +88,4 @@ echo ' } ' > $DATA_FILE -$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONColumns) settings input_format_max_rows_to_read_for_schema_inference=3, input_format_json_infer_incomplete_types_as_strings=0" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL' +$CLICKHOUSE_CLIENT -q "desc file(data_02293, JSONColumns) settings input_format_max_rows_to_read_for_schema_inference=3, input_format_json_infer_incomplete_types_as_strings=0" 2>&1 | grep -F -q 'CANNOT_EXTRACT_TABLE_STRUCTURE' && echo 'OK' || echo 'FAIL' diff --git a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh index dfc0dedeaf1..650faf6985e 100755 --- a/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh +++ b/tests/queries/0_stateless/02327_capnproto_protobuf_empty_messages.sh @@ -15,11 +15,11 @@ mkdir -p $SCHEMADIR/$SERVER_SCHEMADIR cp -r $CLIENT_SCHEMADIR/02327_* $SCHEMADIR/$SERVER_SCHEMADIR/ -$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; -$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F 
-q 'Cannot extract table structure' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_protobuf engine=File(Protobuf) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; +$CLICKHOUSE_CLIENT --query="create table test_capnp engine=File(CapnProto) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty'" 2>&1 | grep -F -q 'The table structure cannot be extracted' && echo 'OK' || echo 'FAIL'; $CLICKHOUSE_CLIENT --query="desc file(data.pb) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_protobuf_skip_fields_with_unsupported_types_in_schema_inference=1"; $CLICKHOUSE_CLIENT --query="desc file(data.capnp) settings format_schema='$SERVER_SCHEMADIR/02327_schema:MessageWithEmpty', input_format_capn_proto_skip_fields_with_unsupported_types_in_schema_inference=1"; diff --git a/tests/queries/0_stateless/02416_json_object_inference.sql b/tests/queries/0_stateless/02416_json_object_inference.sql index 91137c0243c..3022ee026d0 100644 --- a/tests/queries/0_stateless/02416_json_object_inference.sql +++ b/tests/queries/0_stateless/02416_json_object_inference.sql @@ -2,5 +2,5 @@ set allow_experimental_object_type=1; desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); set allow_experimental_object_type=0, input_format_json_read_objects_as_strings=0, input_format_json_try_infer_named_tuples_from_objects=0, input_format_json_read_numbers_as_strings=0; -desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); -- {serverError 652} +desc format(JSONEachRow, '{"a" : {"b" : {"c" : 1, "d" : "str"}}}'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql b/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql index 626a4d7034e..f67e5496a98 100644 --- a/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql +++ b/tests/queries/0_stateless/02455_duplicate_column_names_in_schema_inference.sql @@ -1,7 +1,7 @@ -- Tags: no-fasttest -desc format(JSONEachRow, '{"x" : 1, "x" : 2}'); -- {serverError INCORRECT_DATA} -desc format(JSONEachRow, '{"x" : 1, "y" : 2}\n{"x" : 2, "x" : 3}'); -- {serverError INCORRECT_DATA} -desc format(CSVWithNames, 'a,b,a\n1,2,3'); -- {serverError INCORRECT_DATA} -desc format(CSV, '1,2,3') settings column_names_for_schema_inference='a, b, a'; -- {serverError INCORRECT_DATA} +desc format(JSONEachRow, '{"x" : 1, "x" : 2}'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +desc format(JSONEachRow, '{"x" : 1, "y" : 2}\n{"x" : 2, "x" : 3}'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +desc format(CSVWithNames, 'a,b,a\n1,2,3'); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +desc format(CSV, '1,2,3') settings column_names_for_schema_inference='a, b, a'; -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql index ac549a7faf1..71a2381d7b6 100644 --- a/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql +++ b/tests/queries/0_stateless/02458_use_structure_from_insertion_table.sql @@ -10,14 +10,14 @@ set input_format_json_infer_incomplete_types_as_strings=0; insert into test select * from file(02458_data.jsonl); insert into test select x, 1 from file(02458_data.jsonl); insert into test select 
x, y from file(02458_data.jsonl); -insert into test select x + 1, y from file(02458_data.jsonl); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x + 1, y from file(02458_data.jsonl); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} insert into test select x, z from file(02458_data.jsonl); insert into test select * from file(02458_data.jsoncompacteachrow); -insert into test select x, 1 from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} -insert into test select x, y from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} -insert into test select x + 1, y from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} -insert into test select x, z from file(02458_data.jsoncompacteachrow); -- {serverError ONLY_NULLS_WHILE_READING_SCHEMA} +insert into test select x, 1 from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x, y from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x + 1, y from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} +insert into test select x, z from file(02458_data.jsoncompacteachrow); -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} insert into test select * from input() format CSV 1,2 insert into test select x, y from input() format CSV 1,2 -- {serverError CANNOT_EXTRACT_TABLE_STRUCTURE} diff --git a/tests/queries/0_stateless/02724_database_s3.sh b/tests/queries/0_stateless/02724_database_s3.sh index 13b627c0342..80b47282146 100755 --- a/tests/queries/0_stateless/02724_database_s3.sh +++ b/tests/queries/0_stateless/02724_database_s3.sh @@ -46,7 +46,7 @@ DROP DATABASE IF EXISTS test3; CREATE DATABASE test3 ENGINE = S3; USE test3; SELECT * FROM \"http://localhost:11111/test/a.myext\" -""" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: +""" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "S3_ERROR" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --multiline --multiquery -q """ USE test3; diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh index b4e081f6de0..71ccee6f5f4 100755 --- a/tests/queries/0_stateless/02725_database_hdfs.sh +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -58,7 +58,7 @@ SELECT * FROM \"abacaba/file.tsv\" """ 2>&1 | tr '\n' ' ' | grep -oF "CANNOT_EXTRACT_TABLE_STRUCTURE" ${CLICKHOUSE_CLIENT} -q "SELECT * FROM test_hdfs_4.\`http://localhost:11111/test/a.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "HDFS_ERROR" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222\`" 2>&1 | tr '\n' ' ' | grep -oF -e 
"UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: diff --git a/tests/queries/0_stateless/02900_union_schema_inference_mode.sh b/tests/queries/0_stateless/02900_union_schema_inference_mode.sh index dc0dd8ae1f4..a0fdb5276e0 100755 --- a/tests/queries/0_stateless/02900_union_schema_inference_mode.sh +++ b/tests/queries/0_stateless/02900_union_schema_inference_mode.sh @@ -39,13 +39,13 @@ desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/archive.tar :: data{1,2,3}.jsonl'); " echo 'Error' > $CLICKHOUSE_TEST_UNIQUE_NAME/data4.jsonl -$CLICKHOUSE_LOCAL -q "desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{1,2,3,4}.jsonl') settings schema_inference_mode='union'" 2>&1 | grep -c -F "Cannot extract table structure" +$CLICKHOUSE_LOCAL -q "desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{1,2,3,4}.jsonl') settings schema_inference_mode='union'" 2>&1 | grep -c -F "CANNOT_EXTRACT_TABLE_STRUCTURE" $CLICKHOUSE_LOCAL -nm -q " set schema_inference_mode = 'union'; desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{2,3}.jsonl'); desc file('$CLICKHOUSE_TEST_UNIQUE_NAME/data{1,2,3,4}.jsonl'); -" 2>&1 | grep -c -F "Cannot extract table structure" +" 2>&1 | grep -c -F "CANNOT_EXTRACT_TABLE_STRUCTURE" echo 42 > $CLICKHOUSE_TEST_UNIQUE_NAME/data1.csv echo 42, 43 > $CLICKHOUSE_TEST_UNIQUE_NAME/data2.csv From 849858017237d9752f3efb801bcc2267288cb8c8 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 24 Jan 2024 10:01:06 +0100 Subject: [PATCH 131/884] Fixing build --- src/Backups/BackupIO_AzureBlobStorage.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index fca324869ae..34be110cd42 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include @@ -143,7 +144,7 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( const WriteSettings & write_settings_, const ContextPtr & context_) : BackupWriterDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupWriterAzureBlobStorage")) - , data_source_description{DataSourceType::AzureBlobStorage,configuration_.container, false, false} + , data_source_description{DataSourceType::AzureBlobStorage, configuration_.container, false, false} , configuration(configuration_) { auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); From 788eb487075fe770097759edfd46544134e11116 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 24 Jan 2024 11:51:02 +0100 Subject: [PATCH 132/884] Fix build after merging master --- src/Backups/BackupIO_AzureBlobStorage.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 34be110cd42..2c2396e9c0a 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -34,7 +34,7 @@ BackupReaderAzureBlobStorage::BackupReaderAzureBlobStorage( const WriteSettings & write_settings_, const ContextPtr & context_) : BackupReaderDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupReaderAzureBlobStorage")) - , data_source_description{DataSourceType::AzureBlobStorage, configuration_.container, false, false} + , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::Azure, MetadataStorageType::None, configuration_.container, false, false} , configuration(configuration_) { auto client_ptr = 
StorageAzureBlob::createClient(configuration, /* is_read_only */ false); @@ -99,7 +99,8 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, LOG_INFO(&Poco::Logger::get("BackupReaderAzureBlobStorage"), "Enter copyFileToDisk"); auto destination_data_source_description = destination_disk->getDataSourceDescription(); - if ((destination_data_source_description.type == DataSourceType::AzureBlobStorage) + if ((destination_data_source_description.type == DataSourceType::ObjectStorage) + && (destination_data_source_description.object_storage_type == ObjectStorageType::Azure) && (destination_data_source_description.is_encrypted == encrypted_in_backup)) { LOG_TRACE(log, "Copying {} from AzureBlobStorage to disk {}", path_in_backup, destination_disk->getName()); @@ -144,7 +145,7 @@ BackupWriterAzureBlobStorage::BackupWriterAzureBlobStorage( const WriteSettings & write_settings_, const ContextPtr & context_) : BackupWriterDefault(read_settings_, write_settings_, &Poco::Logger::get("BackupWriterAzureBlobStorage")) - , data_source_description{DataSourceType::AzureBlobStorage, configuration_.container, false, false} + , data_source_description{DataSourceType::ObjectStorage, ObjectStorageType::Azure, MetadataStorageType::None, configuration_.container, false, false} , configuration(configuration_) { auto client_ptr = StorageAzureBlob::createClient(configuration, /* is_read_only */ false); From 10aaf2cbe46c7a00f744ad2e6183c441db847587 Mon Sep 17 00:00:00 2001 From: Alex Cheng Date: Wed, 24 Jan 2024 21:10:48 +0800 Subject: [PATCH 133/884] fix the default number of async_insert_max_data_size --- docs/en/operations/settings/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index f085fe1abcd..4bef6f4a02d 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1922,7 +1922,7 @@ Possible values: - Positive integer. - 0 — Asynchronous insertions are disabled. -Default value: `100000`. +Default value: `1000000`. ### async_insert_max_query_number {#async-insert-max-query-number} From e988f8a47142ab07228fbaee9acb4ba64f2644e1 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Wed, 24 Jan 2024 17:30:04 +0100 Subject: [PATCH 134/884] fix typo in formats.md --- docs/en/interfaces/formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index fd44fbf4462..a3f54c1c383 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -279,7 +279,7 @@ the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quo `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` -In cases where it is challenging or not possible to deploy format output configuration for the template format to a directory on all nodes in a cluste, or if the format is trivial then `format_schema_rows_template` can be used to pass the template string directly in the query, rather than a path to the file which contains it. +In cases where it is challenging or not possible to deploy format output configuration for the template format to a directory on all nodes in a cluster, or if the format is trivial then `format_schema_rows_template` can be used to pass the template string directly in the query, rather than a path to the file which contains it. 
The `format_template_rows_between_delimiter` setting specifies the delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) From 6a9e7abf05760ef0eb7f531970f9eb110e7b4ea8 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Wed, 24 Jan 2024 17:57:46 +0100 Subject: [PATCH 135/884] Update 00937_format_schema_rows_template.sh --- .../queries/0_stateless/00937_format_schema_rows_template.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.sh b/tests/queries/0_stateless/00937_format_schema_rows_template.sh index 651e3618f83..3124cc3b52b 100755 --- a/tests/queries/0_stateless/00937_format_schema_rows_template.sh +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.sh @@ -26,7 +26,8 @@ echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Like $CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ format_template_row = '$CURDIR/00937_template_output_format_row.tmp', \ format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ -format_template_rows_between_delimiter = ';\n'"; -- { serverError 474 } +format_template_rows_between_delimiter = ';\n'"; +-- { serverError 474 } $CLICKHOUSE_CLIENT --query="DROP TABLE template"; -rm "$CURDIR"/00937_template_output_format_row.tmp \ No newline at end of file +rm "$CURDIR"/00937_template_output_format_row.tmp From 11f1ea50d7182e3f9493e026b85cd91f6461aab4 Mon Sep 17 00:00:00 2001 From: avogar Date: Wed, 24 Jan 2024 17:55:31 +0000 Subject: [PATCH 136/884] Fix tests --- src/Formats/ReadSchemaUtils.cpp | 9 +-- src/Storages/HDFS/StorageHDFS.cpp | 2 +- src/Storages/StorageAzureBlob.cpp | 60 +++++++++++++++---- src/Storages/StorageS3.cpp | 33 +++++----- .../TableFunctionAzureBlobStorage.cpp | 2 +- tests/integration/test_storage_hdfs/test.py | 4 +- tests/integration/test_storage_s3/test.py | 6 +- .../0_stateless/02725_database_hdfs.sh | 2 +- 8 files changed, 79 insertions(+), 39 deletions(-) diff --git a/src/Formats/ReadSchemaUtils.cpp b/src/Formats/ReadSchemaUtils.cpp index 5576da56dbf..f97df25aba7 100644 --- a/src/Formats/ReadSchemaUtils.cpp +++ b/src/Formats/ReadSchemaUtils.cpp @@ -226,10 +226,12 @@ try continue; } - SchemaReaderPtr schema_reader; + std::unique_ptr peekable_buf; /// Can be used in format detection. Should be destroyed after schema reader. if (format_name) { + SchemaReaderPtr schema_reader; + try { schema_reader = FormatFactory::instance().getSchemaReader(*format_name, *iterator_data.buf, context, format_settings); @@ -296,7 +298,6 @@ try /// to high memory usage as it will save all the read data from the beginning of the file, /// especially it will be noticeable for formats like Parquet/ORC/Arrow that do seeks to the /// end of file. - std::unique_ptr peekable_buf; bool support_buf_recreation = read_buffer_iterator.supportsLastReadBufferRecreation(); if (!support_buf_recreation) { @@ -310,7 +311,7 @@ try { try { - schema_reader = FormatFactory::instance().getSchemaReader(format_to_detect, support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); + SchemaReaderPtr schema_reader = FormatFactory::instance().getSchemaReader(format_to_detect, support_buf_recreation ? 
*iterator_data.buf : *peekable_buf, context, format_settings); schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); names_and_types = schema_reader->readSchema(); if (names_and_types.empty()) @@ -355,7 +356,7 @@ try { try { - schema_reader = FormatFactory::instance().getSchemaReader( + SchemaReaderPtr schema_reader = FormatFactory::instance().getSchemaReader( formats_set_to_detect[i], support_buf_recreation ? *iterator_data.buf : *peekable_buf, context, format_settings); schema_reader->setMaxRowsAndBytesToRead(max_rows_to_read, max_bytes_to_read); auto tmp_names_and_types = schema_reader->readSchema(); diff --git a/src/Storages/HDFS/StorageHDFS.cpp b/src/Storages/HDFS/StorageHDFS.cpp index a846e9fd9ef..59eba6505f3 100644 --- a/src/Storages/HDFS/StorageHDFS.cpp +++ b/src/Storages/HDFS/StorageHDFS.cpp @@ -1147,7 +1147,7 @@ void registerStorageHDFS(StorageFactory & factory) } if (format_name == "auto") - format_name = FormatFactory::instance().getFormatFromFileName(url); + format_name = FormatFactory::instance().tryGetFormatFromFileName(url).value_or("auto"); String compression_method; if (engine_args.size() == 3) diff --git a/src/Storages/StorageAzureBlob.cpp b/src/Storages/StorageAzureBlob.cpp index 35072dc5cae..c55725ce940 100644 --- a/src/Storages/StorageAzureBlob.cpp +++ b/src/Storages/StorageAzureBlob.cpp @@ -144,7 +144,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); return configuration; } @@ -237,7 +237,7 @@ StorageAzureBlob::Configuration StorageAzureBlob::getConfiguration(ASTs & engine configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); return configuration; } @@ -1316,10 +1316,28 @@ namespace Data next() override { /// For default mode check cached columns for currently read keys on first iteration. - if (first && getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + if (first) { - if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) - return {nullptr, cached_columns, format}; + /// If format is unknown we iterate through all currently read keys on first iteration and + /// try to determine format by file name. + if (!format) + { + for (const auto & key : read_keys) + { + if (auto format_from_path = FormatFactory::instance().tryGetFormatFromFileName(key.relative_path)) + { + format = format_from_path; + break; + } + } + } + + /// For default mode check cached columns for currently read keys on first iteration. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + if (auto cached_columns = tryGetColumnsFromCache(read_keys.begin(), read_keys.end())) + return {nullptr, cached_columns, format}; + } } current_path_with_metadata = file_iterator->next(); @@ -1345,15 +1363,33 @@ namespace first = false; - /// AzureBlobStorage file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default. 
- if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size) + /// AzureBlobStorage file iterator could get new keys after new iteration. + if (read_keys.size() > prev_read_keys_size) { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + /// If format is unknown we can try to determine it by new file names. + if (!format) + { + for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) + { + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it).relative_path)) + { + format = format_from_file_name; + break; + } + } + } + /// Check new files in schema cache if schema inference mode is default. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + if (columns_from_cache) + return {nullptr, columns_from_cache, format}; + } + prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache, format}; } - else if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) + + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::UNION) { RelativePathsWithMetadata paths = {current_path_with_metadata}; if (auto columns_from_cache = tryGetColumnsFromCache(paths.begin(), paths.end())) @@ -1520,7 +1556,7 @@ ColumnsDescription StorageAzureBlob::getTableStructureFromData( const std::optional & format_settings, const DB::ContextPtr & ctx) { - return getTableStructureAndFormatFromDataImpl(std::nullopt, object_storage, configuration, format_settings, ctx).first; + return getTableStructureAndFormatFromDataImpl(configuration.format, object_storage, configuration, format_settings, ctx).first; } SchemaCache & StorageAzureBlob::getSchemaCache(const ContextPtr & ctx) diff --git a/src/Storages/StorageS3.cpp b/src/Storages/StorageS3.cpp index 8e5b6040a63..f9c7400edfb 100644 --- a/src/Storages/StorageS3.cpp +++ b/src/Storages/StorageS3.cpp @@ -1702,30 +1702,33 @@ namespace return {nullptr, std::nullopt, format}; } - /// S3 file iterator could get new keys after new iteration, if format is unknown we can try to determine it by new file names. - if (!format && read_keys.size() > prev_read_keys_size) + /// S3 file iterator could get new keys after new iteration + if (read_keys.size() > prev_read_keys_size) { - for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) + /// If format is unknown we can try to determine it by new file names. + if (!format) { - if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->key)) + for (auto it = read_keys.begin() + prev_read_keys_size; it != read_keys.end(); ++it) { - format = format_from_file_name; - break; + if (auto format_from_file_name = FormatFactory::instance().tryGetFormatFromFileName((*it)->key)) + { + format = format_from_file_name; + break; + } } } - } - /// S3 file iterator could get new keys after new iteration, check them in schema cache if schema inference mode is default. 
- if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT && read_keys.size() > prev_read_keys_size) - { - auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + /// Check new files in schema cache if schema inference mode is default. + if (getContext()->getSettingsRef().schema_inference_mode == SchemaInferenceMode::DEFAULT) + { + auto columns_from_cache = tryGetColumnsFromCache(read_keys.begin() + prev_read_keys_size, read_keys.end()); + if (columns_from_cache) + return {nullptr, columns_from_cache, format}; + } + prev_read_keys_size = read_keys.size(); - if (columns_from_cache) - return {nullptr, columns_from_cache, format}; } - prev_read_keys_size = read_keys.size(); - if (getContext()->getSettingsRef().s3_skip_empty_files && current_key_with_info->info && current_key_with_info->info->size == 0) continue; diff --git a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp index b9e0af53b7b..8a537e154db 100644 --- a/src/TableFunctions/TableFunctionAzureBlobStorage.cpp +++ b/src/TableFunctions/TableFunctionAzureBlobStorage.cpp @@ -58,7 +58,7 @@ void TableFunctionAzureBlobStorage::parseArgumentsImpl(ASTs & engine_args, const configuration.blobs_paths = {configuration.blob_path}; if (configuration.format == "auto") - configuration.format = FormatFactory::instance().getFormatFromFileName(configuration.blob_path); + configuration.format = FormatFactory::instance().tryGetFormatFromFileName(configuration.blob_path).value_or("auto"); } else { diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 8dee15f4d94..165dfb212b7 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -600,7 +600,7 @@ def test_schema_inference_with_globs(started_cluster): ) assert ( - "Cannot extract table structure from JSONCompactEachRow format file" in result + "CANNOT_EXTRACT_TABLE_STRUCTURE" in result ) @@ -1044,7 +1044,7 @@ def test_union_schema_inference_mode(started_cluster): error = node.query_and_get_error( "desc hdfs('hdfs://hdfs1:9000/test_union_schema_inference*.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV" ) - assert "Cannot extract table structure" in error + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error def test_format_detection(started_cluster): diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 365ade7da65..0b5e9462860 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1380,7 +1380,7 @@ def test_schema_inference_from_globs(started_cluster): ) assert ( - "Cannot extract table structure from JSONCompactEachRow format file" in result + "CANNOT_EXTRACT_TABLE_STRUCTURE" in result ) url_filename = "test{0,1,2,3}.jsoncompacteachrow" @@ -1390,7 +1390,7 @@ def test_schema_inference_from_globs(started_cluster): ) assert ( - "Cannot extract table structure from JSONCompactEachRow format file" in result + "CANNOT_EXTRACT_TABLE_STRUCTURE" in result ) @@ -2193,7 +2193,7 @@ def test_union_schema_inference_mode(started_cluster): error = instance.query_and_get_error( f"desc {engine}('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test_union_schema_inference{{1,2,3,4}}.jsonl') settings schema_inference_mode='union', describe_compact_output=1 format TSV" ) - assert "Cannot extract table structure" in 
error + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error def test_s3_format_detection(started_cluster): diff --git a/tests/queries/0_stateless/02725_database_hdfs.sh b/tests/queries/0_stateless/02725_database_hdfs.sh index 71ccee6f5f4..d62f928e947 100755 --- a/tests/queries/0_stateless/02725_database_hdfs.sh +++ b/tests/queries/0_stateless/02725_database_hdfs.sh @@ -58,7 +58,7 @@ SELECT * FROM \"abacaba/file.tsv\" """ 2>&1 | tr '\n' ' ' | grep -oF "CANNOT_EXTRACT_TABLE_STRUCTURE" ${CLICKHOUSE_CLIENT} -q "SELECT * FROM test_hdfs_4.\`http://localhost:11111/test/a.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: -${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "HDFS_ERROR" > /dev/null && echo "OK" || echo 'FAIL' ||: +${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/file.myext\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222/test_02725_3.tsv\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "CANNOT_EXTRACT_TABLE_STRUCTURE" > /dev/null && echo "OK" || echo 'FAIL' ||: ${CLICKHOUSE_CLIENT} --query "SELECT * FROM test_hdfs_4.\`hdfs://localhost:12222\`" 2>&1 | tr '\n' ' ' | grep -oF -e "UNKNOWN_TABLE" -e "BAD_ARGUMENTS" > /dev/null && echo "OK" || echo 'FAIL' ||: From f551081dd4c38ac014f554c7ee4efc4e18777f9a Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Wed, 24 Jan 2024 21:10:50 +0100 Subject: [PATCH 137/884] Addressed review comments --- src/Backups/BackupIO_AzureBlobStorage.cpp | 7 ++--- .../copyAzureBlobStorageFile.cpp | 27 ++++++++----------- .../copyAzureBlobStorageFile.h | 8 +++--- 3 files changed, 16 insertions(+), 26 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 2c2396e9c0a..1b4c10ad0cb 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -104,7 +104,7 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, && (destination_data_source_description.is_encrypted == encrypted_in_backup)) { LOG_TRACE(log, "Copying {} from AzureBlobStorage to disk {}", path_in_backup, destination_disk->getName()); - auto write_blob_function = [&](const Strings & blob_path, WriteMode mode, const std::optional & object_attributes) -> size_t + auto write_blob_function = [&](const Strings & blob_path, WriteMode mode, const std::optional &) -> size_t { /// Object storage always uses mode `Rewrite` because it simulates append using metadata and different files. if (blob_path.size() != 2 || mode != WriteMode::Rewrite) @@ -123,7 +123,6 @@ void BackupReaderAzureBlobStorage::copyFileToDisk(const String & path_in_backup, /* dest_path */ blob_path[0], settings, read_settings, - object_attributes, threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupRDAzure"), /* for_disk_azure_blob_storage= */ true); @@ -180,7 +179,6 @@ void BackupWriterAzureBlobStorage::copyFileFromDisk(const String & path_in_backu fs::path(configuration.blob_path) / path_in_backup, settings, read_settings, - {}, threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure")); return; /// copied! 
} @@ -204,14 +202,13 @@ void BackupWriterAzureBlobStorage::copyFile(const String & destination, const St /* dest_path */ destination, settings, read_settings, - {}, threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure"), /* for_disk_azure_blob_storage= */ true); } void BackupWriterAzureBlobStorage::copyDataToFile(const String & path_in_backup, const CreateReadBufferFunction & create_read_buffer, UInt64 start_pos, UInt64 length) { - copyDataToAzureBlobStorageFile(create_read_buffer, start_pos, length, client, configuration.container, path_in_backup, settings, {}, + copyDataToAzureBlobStorageFile(create_read_buffer, start_pos, length, client, configuration.container, path_in_backup, settings, threadPoolCallbackRunner(getBackupsIOThreadPool().get(), "BackupWRAzure")); } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index bb8702e9b41..350d2d1d34e 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -47,10 +47,9 @@ namespace MultiVersion & client_, size_t offset_, size_t total_size_, - const String & dest_container_, + const String & dest_container_for_logging_, const String & dest_blob_, MultiVersion settings_, - const std::optional> & object_metadata_, ThreadPoolCallbackRunner schedule_, bool for_disk_azure_blob_storage_, const Poco::Logger * log_) @@ -58,10 +57,9 @@ namespace , client(client_) , offset (offset_) , total_size (total_size_) - , dest_container(dest_container_) + , dest_container_for_logging(dest_container_for_logging_) , dest_blob(dest_blob_) , settings(settings_) - , object_metadata(object_metadata_) , schedule(schedule_) , for_disk_azure_blob_storage(for_disk_azure_blob_storage_) , log(log_) @@ -76,10 +74,9 @@ namespace MultiVersion & client; size_t offset; size_t total_size; - const String & dest_container; + const String & dest_container_for_logging; const String & dest_blob; MultiVersion settings; - const std::optional> & object_metadata; ThreadPoolCallbackRunner schedule; bool for_disk_azure_blob_storage; const Poco::Logger * log; @@ -208,7 +205,7 @@ namespace void uploadPart(size_t part_offset, size_t part_size) { - LOG_TRACE(log, "Writing part. Container: {}, Blob: {}, Size: {}", dest_container, dest_blob, part_size); + LOG_TRACE(log, "Writing part. Container: {}, Blob: {}, Size: {}", dest_container_for_logging, dest_blob, part_size); if (!part_size) { @@ -287,7 +284,7 @@ namespace std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race task.block_id = block_id; - LOG_TRACE(log, "Writing part finished. Container: {}, Blob: {}, block_id: {}, Parts: {}", dest_container, dest_blob, block_id, bg_tasks.size()); + LOG_TRACE(log, "Writing part finished. 
Container: {}, Blob: {}, block_id: {}, Parts: {}", dest_container_for_logging, dest_blob, block_id, bg_tasks.size()); } String processUploadPartRequest(UploadPartTask & task) @@ -331,14 +328,13 @@ void copyDataToAzureBlobStorageFile( size_t offset, size_t size, MultiVersion & dest_client, - const String & dest_container, + const String & dest_container_for_logging, const String & dest_blob, MultiVersion settings, - const std::optional> & object_metadata, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) { - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container, dest_blob, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyDataToAzureBlobStorageFile")}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container_for_logging, dest_blob, settings, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyDataToAzureBlobStorageFile")}; helper.performCopy(); } @@ -346,15 +342,14 @@ void copyDataToAzureBlobStorageFile( void copyAzureBlobStorageFile( MultiVersion & src_client, MultiVersion & dest_client, - const String & src_container, + const String & src_container_for_logging, const String & src_blob, size_t offset, size_t size, - const String & dest_container, + const String & dest_container_for_logging, const String & dest_blob, MultiVersion settings, const ReadSettings & read_settings, - const std::optional> & object_metadata, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) { @@ -390,14 +385,14 @@ void copyAzureBlobStorageFile( } else { - LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container, src_blob); + LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container_for_logging, src_blob); auto create_read_buffer = [&] { return std::make_unique(src_client.get(), src_blob, read_settings, settings.get()->max_single_read_retries, settings.get()->max_single_download_retries); }; - UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container, dest_blob, settings, object_metadata, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; + UploadHelper helper{create_read_buffer, dest_client, offset, size, dest_container_for_logging, dest_blob, settings, schedule, for_disk_azure_blob_storage, &Poco::Logger::get("copyAzureBlobStorageFile")}; helper.performCopy(); } } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index 491f7cd7176..15a31031f63 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -23,15 +23,14 @@ using CreateReadBuffer = std::function()>; void copyAzureBlobStorageFile( MultiVersion & src_client, MultiVersion & dest_client, - const String & src_container, + const String & src_container_for_logging, const String & src_blob, size_t src_offset, size_t src_size, - const String & dest_container, + const String & dest_container_for_logging, const String & dest_blob, MultiVersion settings, const ReadSettings & read_settings, - const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, bool for_disk_azure_blob_storage = false); @@ -46,10 +45,9 @@ void copyDataToAzureBlobStorageFile( size_t offset, size_t size, MultiVersion & client, - const String & dest_container, + const String & dest_container_for_logging, const String & 
dest_blob, MultiVersion settings, - const std::optional> & object_metadata = std::nullopt, ThreadPoolCallbackRunner schedule_ = {}, bool for_disk_azure_blob_storage = false); From ad196dd047e443158b18b8dfc52d1cf2d14d6593 Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Thu, 25 Jan 2024 01:18:27 +0200 Subject: [PATCH 138/884] Update 00937_format_schema_rows_template.sh fix failing shellcheck --- tests/queries/0_stateless/00937_format_schema_rows_template.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.sh b/tests/queries/0_stateless/00937_format_schema_rows_template.sh index 3124cc3b52b..8b512513d94 100755 --- a/tests/queries/0_stateless/00937_format_schema_rows_template.sh +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.sh @@ -26,8 +26,7 @@ echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Like $CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ format_template_row = '$CURDIR/00937_template_output_format_row.tmp', \ format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ -format_template_rows_between_delimiter = ';\n'"; --- { serverError 474 } +format_template_rows_between_delimiter = ';\n'; --{ serverError 474 }" $CLICKHOUSE_CLIENT --query="DROP TABLE template"; rm "$CURDIR"/00937_template_output_format_row.tmp From 3e3ae52acaedc57b9470f5b59c45307a7e048068 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Wed, 24 Jan 2024 14:35:35 +0000 Subject: [PATCH 139/884] Reduce even more memory --- programs/keeper-converter/KeeperConverter.cpp | 42 ++-- programs/keeper/CMakeLists.txt | 2 +- src/Coordination/KeeperSnapshotManager.cpp | 82 ++++--- src/Coordination/KeeperStorage.cpp | 223 +++++++++++------- src/Coordination/KeeperStorage.h | 153 ++++++++++-- src/Coordination/SnapshotableHashTable.h | 34 +-- src/Coordination/ZooKeeperDataReader.cpp | 2 +- 7 files changed, 353 insertions(+), 185 deletions(-) diff --git a/programs/keeper-converter/KeeperConverter.cpp b/programs/keeper-converter/KeeperConverter.cpp index 20448aafa2f..99f8bab3403 100644 --- a/programs/keeper-converter/KeeperConverter.cpp +++ b/programs/keeper-converter/KeeperConverter.cpp @@ -38,31 +38,31 @@ int mainEntryClickHouseKeeperConverter(int argc, char ** argv) return 0; } - try - { - auto keeper_context = std::make_shared(true); - keeper_context->setDigestEnabled(true); - keeper_context->setSnapshotDisk(std::make_shared("Keeper-snapshots", options["output-dir"].as())); + //try + //{ + // auto keeper_context = std::make_shared(true); + // keeper_context->setDigestEnabled(true); + // keeper_context->setSnapshotDisk(std::make_shared("Keeper-snapshots", options["output-dir"].as())); - DB::KeeperStorage storage(/* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false); + // DB::KeeperStorage storage(/* tick_time_ms */ 500, /* superdigest */ "", keeper_context, /* initialize_system_nodes */ false); - DB::deserializeKeeperStorageFromSnapshotsDir(storage, options["zookeeper-snapshots-dir"].as(), logger); - storage.initializeSystemNodes(); + // DB::deserializeKeeperStorageFromSnapshotsDir(storage, options["zookeeper-snapshots-dir"].as(), logger); + // storage.initializeSystemNodes(); - DB::deserializeLogsAndApplyToStorage(storage, 
options["zookeeper-logs-dir"].as(), logger); - DB::SnapshotMetadataPtr snapshot_meta = std::make_shared(storage.getZXID(), 1, std::make_shared()); - DB::KeeperStorageSnapshot snapshot(&storage, snapshot_meta); + // DB::deserializeLogsAndApplyToStorage(storage, options["zookeeper-logs-dir"].as(), logger); + // DB::SnapshotMetadataPtr snapshot_meta = std::make_shared(storage.getZXID(), 1, std::make_shared()); + // DB::KeeperStorageSnapshot snapshot(&storage, snapshot_meta); - DB::KeeperSnapshotManager manager(1, keeper_context); - auto snp = manager.serializeSnapshotToBuffer(snapshot); - auto file_info = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID()); - std::cout << "Snapshot serialized to path:" << fs::path(file_info.disk->getPath()) / file_info.path << std::endl; - } - catch (...) - { - std::cerr << getCurrentExceptionMessage(true) << '\n'; - return getCurrentExceptionCode(); - } + // DB::KeeperSnapshotManager manager(1, keeper_context); + // auto snp = manager.serializeSnapshotToBuffer(snapshot); + // auto file_info = manager.serializeSnapshotBufferToDisk(*snp, storage.getZXID()); + // std::cout << "Snapshot serialized to path:" << fs::path(file_info.disk->getPath()) / file_info.path << std::endl; + //} + //catch (...) + //{ + // std::cerr << getCurrentExceptionMessage(true) << '\n'; + // return getCurrentExceptionCode(); + //} return 0; } diff --git a/programs/keeper/CMakeLists.txt b/programs/keeper/CMakeLists.txt index 143ded0ee85..fba9b3e4d86 100644 --- a/programs/keeper/CMakeLists.txt +++ b/programs/keeper/CMakeLists.txt @@ -44,7 +44,7 @@ if (BUILD_STANDALONE_KEEPER) ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SessionExpiryQueue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/SummingStateMachine.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/WriteBufferFromNuraftBuffer.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/ZooKeeperDataReader.cpp + #${CMAKE_CURRENT_SOURCE_DIR}/../../src/Coordination/ZooKeeperDataReader.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/SettingsFields.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../../src/Core/BaseSettings.cpp diff --git a/src/Coordination/KeeperSnapshotManager.cpp b/src/Coordination/KeeperSnapshotManager.cpp index ee5935015e4..2f51e855763 100644 --- a/src/Coordination/KeeperSnapshotManager.cpp +++ b/src/Coordination/KeeperSnapshotManager.cpp @@ -79,20 +79,21 @@ namespace writeBinary(false, out); /// Serialize stat - writeBinary(node.stat.czxid, out); - writeBinary(node.stat.mzxid, out); - writeBinary(node.stat.ctime, out); - writeBinary(node.stat.mtime, out); - writeBinary(node.stat.version, out); - writeBinary(node.stat.cversion, out); - writeBinary(node.stat.aversion, out); - writeBinary(node.stat.ephemeralOwner, out); + writeBinary(node.czxid, out); + writeBinary(node.mzxid, out); + writeBinary(node.ctime(), out); + writeBinary(node.mtime(), out); + writeBinary(node.version, out); + writeBinary(node.cversion, out); + writeBinary(node.aversion, out); + const bool is_ephemeral = node.isEphemeral(); + writeBinary(is_ephemeral ? node.ephemeralOwner() : 0, out); if (version < SnapshotVersion::V6) - writeBinary(static_cast(node.getData().size()), out); - writeBinary(node.stat.numChildren, out); - writeBinary(node.stat.pzxid, out); + writeBinary(static_cast(node.data_size), out); + writeBinary(is_ephemeral ? 
0 : node.numChildren(), out); + writeBinary(node.pzxid, out); - writeBinary(node.seq_num, out); + writeBinary(node.seqNum(), out); if (version >= SnapshotVersion::V4 && version <= SnapshotVersion::V5) writeBinary(node.sizeInBytes(), out); @@ -102,7 +103,7 @@ namespace { String new_data; readBinary(new_data, in); - node.setData(std::move(new_data)); + node.setData(new_data); if (version >= SnapshotVersion::V1) { @@ -138,22 +139,41 @@ namespace } /// Deserialize stat - readBinary(node.stat.czxid, in); - readBinary(node.stat.mzxid, in); - readBinary(node.stat.ctime, in); - readBinary(node.stat.mtime, in); - readBinary(node.stat.version, in); - readBinary(node.stat.cversion, in); - readBinary(node.stat.aversion, in); - readBinary(node.stat.ephemeralOwner, in); + readBinary(node.czxid, in); + readBinary(node.mzxid, in); + int64_t ctime; + readBinary(ctime, in); + node.setCtime(ctime); + int64_t mtime; + readBinary(mtime, in); + node.setMtime(mtime); + readBinary(node.version, in); + readBinary(node.cversion, in); + readBinary(node.aversion, in); + int64_t ephemeral_owner = 0; + readBinary(ephemeral_owner, in); + if (ephemeral_owner != 0) + { + node.is_ephemeral_and_mtime.is_ephemeral = true; + node.ephemeral_or_children_data.ephemeral_owner = ephemeral_owner; + } + if (version < SnapshotVersion::V6) { int32_t data_length = 0; readBinary(data_length, in); } - readBinary(node.stat.numChildren, in); - readBinary(node.stat.pzxid, in); - readBinary(node.seq_num, in); + int32_t num_children; + readBinary(num_children, in); + if (num_children) + node.ephemeral_or_children_data.children_info.num_children = num_children; + + readBinary(node.pzxid, in); + + int32_t seq_num; + readBinary(seq_num, in); + if (seq_num) + node.ephemeral_or_children_data.children_info.seq_num = seq_num; if (version >= SnapshotVersion::V4 && version <= SnapshotVersion::V5) { @@ -238,7 +258,7 @@ void KeeperStorageSnapshot::serialize(const KeeperStorageSnapshot & snapshot, Wr /// Benign race condition possible while taking snapshot: NuRaft decide to create snapshot at some log id /// and only after some time we lock storage and enable snapshot mode. So snapshot_container_size can be /// slightly bigger than required. 
- if (node.stat.mzxid > snapshot.zxid) + if (node.mzxid > snapshot.zxid) break; writeBinary(path, out); @@ -363,9 +383,9 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial if (recalculate_digest) storage.nodes_digest = 0; - const auto is_node_empty = [](const auto & node) + const auto is_node_empty = [](const auto & /*node*/) { - return node.getData().empty() && node.stat == KeeperStorage::Node::Stat{}; + return false; //node.getData().empty() && node == KeeperStorage::Node{}; }; for (size_t nodes_read = 0; nodes_read < snapshot_container_size; ++nodes_read) @@ -412,8 +432,8 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial } storage.container.insertOrReplace(path, node); - if (node.stat.ephemeralOwner != 0) - storage.ephemerals[node.stat.ephemeralOwner].insert(path); + if (node.isEphemeral()) + storage.ephemerals[node.ephemeralOwner()].insert(path); if (recalculate_digest) storage.nodes_digest += node.getDigest(path); @@ -433,12 +453,12 @@ void KeeperStorageSnapshot::deserialize(SnapshotDeserializationResult & deserial { if (itr.key != "/") { - if (itr.value.stat.numChildren != static_cast(itr.value.getChildren().size())) + if (itr.value.numChildren() != static_cast(itr.value.getChildren().size())) { #ifdef NDEBUG /// TODO (alesapin) remove this, it should be always CORRUPTED_DATA. LOG_ERROR(&Poco::Logger::get("KeeperSnapshotManager"), "Children counter in stat.numChildren {}" - " is different from actual children size {} for node {}", itr.value.stat.numChildren, itr.value.getChildren().size(), itr.key); + " is different from actual children size {} for node {}", itr.value.numChildren(), itr.value.getChildren().size(), itr.key); #else throw Exception(ErrorCodes::LOGICAL_ERROR, "Children counter in stat.numChildren {}" " is different from actual children size {} for node {}", diff --git a/src/Coordination/KeeperStorage.cpp b/src/Coordination/KeeperStorage.cpp index c128d7c2f98..6cdbedc2dc6 100644 --- a/src/Coordination/KeeperStorage.cpp +++ b/src/Coordination/KeeperStorage.cpp @@ -166,56 +166,88 @@ KeeperStorage::ResponsesForSessions processWatchesImpl( } // When this function is updated, update CURRENT_DIGEST_VERSION!! -uint64_t calculateDigest(std::string_view path, std::string_view data, const KeeperStorage::Node::Stat & stat) +uint64_t calculateDigest(std::string_view path, const KeeperStorage::Node & node) { SipHash hash; hash.update(path); - hash.update(data); + hash.update(node.data, node.data_size); - hash.update(stat.czxid); - hash.update(stat.czxid); - hash.update(stat.mzxid); - hash.update(stat.ctime); - hash.update(stat.mtime); - hash.update(stat.version); - hash.update(stat.cversion); - hash.update(stat.aversion); - hash.update(stat.ephemeralOwner); - hash.update(data.length()); - hash.update(stat.numChildren); - hash.update(stat.pzxid); + hash.update(node.czxid); + hash.update(node.czxid); + hash.update(node.mzxid); + hash.update(node.ctime()); + hash.update(node.mtime()); + hash.update(node.version); + hash.update(node.cversion); + hash.update(node.aversion); + bool is_ephemeral = node.isEphemeral(); + hash.update(is_ephemeral ? node.ephemeral_or_children_data.ephemeral_owner : 0); + hash.update(node.data_size); + hash.update(is_ephemeral ? 
0 : node.ephemeral_or_children_data.children_info.num_children); + hash.update(node.pzxid); return hash.get64(); } } +void KeeperStorage::Node::copyStats(const Coordination::Stat & stat) +{ + czxid = stat.czxid; + mzxid = stat.mzxid; + pzxid = stat.pzxid; + + setCtime(stat.ctime); + setMtime(stat.mtime); + + version = stat.version; + cversion = stat.cversion; + aversion = stat.aversion; + + if (stat.ephemeralOwner == 0) + { + is_ephemeral_and_mtime.is_ephemeral = false; + ephemeral_or_children_data.children_info.num_children = stat.numChildren; + } + else + { + is_ephemeral_and_mtime.is_ephemeral = true; + ephemeral_or_children_data.ephemeral_owner = stat.ephemeralOwner; + } +} + void KeeperStorage::Node::setResponseStat(Coordination::Stat & response_stat) const { - response_stat.czxid = stat.czxid; - response_stat.mzxid = stat.mzxid; - response_stat.ctime = stat.ctime; - response_stat.mtime = stat.mtime; - response_stat.version = stat.version; - response_stat.cversion = stat.cversion; - response_stat.aversion = stat.aversion; - response_stat.ephemeralOwner = stat.ephemeralOwner; - response_stat.dataLength = static_cast(data.size()); - response_stat.numChildren = stat.numChildren; - response_stat.pzxid = stat.pzxid; + response_stat.czxid = czxid; + response_stat.mzxid = mzxid; + response_stat.ctime = ctime(); + response_stat.mtime = mtime(); + response_stat.version = version; + response_stat.cversion = cversion; + response_stat.aversion = aversion; + bool is_ephemeral = isEphemeral(); + response_stat.ephemeralOwner = is_ephemeral ? ephemeral_or_children_data.ephemeral_owner : 0; + response_stat.dataLength = static_cast(data_size); + response_stat.numChildren = is_ephemeral ? 0 : numChildren(); + response_stat.pzxid = pzxid; } uint64_t KeeperStorage::Node::sizeInBytes() const { - return sizeof(Node) + children.size() * sizeof(StringRef) + data.size(); + return sizeof(Node) + children.size() * sizeof(StringRef) + data_size; } -void KeeperStorage::Node::setData(String new_data) +void KeeperStorage::Node::setData(const String & new_data) { - data = std::move(new_data); + data_size = static_cast(new_data.size()); + if (data_size != 0) + { + data = new char[new_data.size()]; + memcpy(data, new_data.data(), data_size); + } } void KeeperStorage::Node::addChild(StringRef child_path) @@ -230,15 +262,15 @@ void KeeperStorage::Node::removeChild(StringRef child_path) void KeeperStorage::Node::invalidateDigestCache() const { - has_cached_digest = false; + has_cached_digest_and_ctime.has_cached_digest = false; } UInt64 KeeperStorage::Node::getDigest(const std::string_view path) const { - if (!has_cached_digest) + if (!has_cached_digest_and_ctime.has_cached_digest) { - cached_digest = calculateDigest(path, data, stat); - has_cached_digest = true; + cached_digest = calculateDigest(path, *this); + has_cached_digest_and_ctime.has_cached_digest = true; } return cached_digest; @@ -246,9 +278,28 @@ UInt64 KeeperStorage::Node::getDigest(const std::string_view path) const void KeeperStorage::Node::shallowCopy(const KeeperStorage::Node & other) { - stat = other.stat; - seq_num = other.seq_num; - setData(other.getData()); + czxid = other.czxid; + mzxid = other.mzxid; + pzxid = other.pzxid; + acl_id = other.acl_id; /// 0 -- no ACL by default + + has_cached_digest_and_ctime = other.has_cached_digest_and_ctime; + + is_ephemeral_and_mtime = other.is_ephemeral_and_mtime; + + ephemeral_or_children_data = other.ephemeral_or_children_data; + + data_size = other.data_size; + if (data_size != 0) + { + data = new 
char[data_size]; + memcpy(data, other.data, data_size); + } + + version = other.version; + cversion = other.cversion; + aversion = other.aversion; + cached_digest = other.cached_digest; } @@ -284,9 +335,9 @@ void KeeperStorage::initializeSystemNodes() removeDigest(current_root_it->value, "/"); auto updated_root_it = container.updateValue( "/", - [](auto & node) + [](KeeperStorage::Node & node) { - ++node.stat.numChildren; + node.increaseNumChildren(); node.addChild(getBaseNodeName(keeper_system_path)); } ); @@ -359,7 +410,7 @@ void KeeperStorage::UncommittedState::applyDelta(const Delta & delta) { assert(!node); node = std::make_shared(); - node->stat = operation.stat; + node->copyStats(operation.stat); node->setData(operation.data); acls = operation.acls; last_applied_zxid = delta.zxid; @@ -673,7 +724,7 @@ Coordination::Error KeeperStorage::commit(int64_t commit_zxid) if (node_it == container.end()) onStorageInconsistency(); - if (operation.version != -1 && operation.version != node_it->value.stat.version) + if (operation.version != -1 && operation.version != node_it->value.version) onStorageInconsistency(); removeDigest(node_it->value, path); @@ -695,7 +746,7 @@ Coordination::Error KeeperStorage::commit(int64_t commit_zxid) if (node_it == container.end()) onStorageInconsistency(); - if (operation.version != -1 && operation.version != node_it->value.stat.aversion) + if (operation.version != -1 && operation.version != node_it->value.aversion) onStorageInconsistency(); acl_map.removeUsage(node_it->value.acl_id); @@ -740,7 +791,7 @@ Coordination::Error KeeperStorage::commit(int64_t commit_zxid) bool KeeperStorage::createNode( const std::string & path, String data, - const KeeperStorage::Node::Stat & stat, + const Coordination::Stat & stat, Coordination::ACLs node_acls) { auto parent_path = parentNodePath(path); @@ -749,7 +800,7 @@ bool KeeperStorage::createNode( if (node_it == container.end()) return false; - if (node_it->value.stat.ephemeralOwner != 0) + if (node_it->value.isEphemeral()) return false; if (container.contains(path)) @@ -761,8 +812,8 @@ bool KeeperStorage::createNode( acl_map.addUsage(acl_id); created_node.acl_id = acl_id; - created_node.stat = stat; - created_node.setData(std::move(data)); + created_node.copyStats(stat); + created_node.setData(data); auto [map_key, _] = container.insert(path, created_node); /// Take child path from key owned by map. 
auto child_path = getBaseNodeName(map_key->getKey()); @@ -771,7 +822,7 @@ bool KeeperStorage::createNode( [child_path](KeeperStorage::Node & parent) { parent.addChild(child_path); - chassert(parent.stat.numChildren == static_cast(parent.getChildren().size())); + chassert(parent.numChildren() == static_cast(parent.getChildren().size())); } ); @@ -785,21 +836,22 @@ bool KeeperStorage::removeNode(const std::string & path, int32_t version) if (node_it == container.end()) return false; - if (version != -1 && version != node_it->value.stat.version) + if (version != -1 && version != node_it->value.version) return false; - if (node_it->value.stat.numChildren) + if (node_it->value.numChildren()) return false; - auto prev_node = node_it->value; - acl_map.removeUsage(prev_node.acl_id); + KeeperStorage::Node prev_node; + prev_node.shallowCopy(node_it->value); + acl_map.removeUsage(node_it->value.acl_id); container.updateValue( parentNodePath(path), [child_basename = getBaseNodeName(node_it->key)](KeeperStorage::Node & parent) { parent.removeChild(child_basename); - chassert(parent.stat.numChildren == static_cast(parent.getChildren().size())); + chassert(parent.numChildren() == static_cast(parent.getChildren().size())); } ); @@ -959,7 +1011,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr if (parent_node == nullptr) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; - else if (parent_node->stat.ephemeralOwner != 0) + else if (parent_node->isEphemeral()) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNOCHILDRENFOREPHEMERALS}}; std::string path_created = request.path; @@ -968,7 +1020,7 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr if (request.not_exists) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADARGUMENTS}}; - auto seq_num = parent_node->seq_num; + auto seq_num = parent_node->seqNum(); std::stringstream seq_num_str; // STYLE_CHECK_ALLOW_STD_STRING_STREAM seq_num_str.exceptions(std::ios::failbit); @@ -1008,20 +1060,20 @@ struct KeeperStorageCreateRequestProcessor final : public KeeperStorageRequestPr auto parent_update = [parent_cversion, zxid](KeeperStorage::Node & node) { /// Increment sequential number even if node is not sequential - ++node.seq_num; + node.increaseSeqNum(); if (parent_cversion == -1) - ++node.stat.cversion; - else if (parent_cversion > node.stat.cversion) - node.stat.cversion = parent_cversion; + ++node.cversion; + else if (parent_cversion > node.cversion) + node.cversion = parent_cversion; - if (zxid > node.stat.pzxid) - node.stat.pzxid = zxid; - ++node.stat.numChildren; + if (zxid > node.pzxid) + node.pzxid = zxid; + node.increaseNumChildren(); }; new_deltas.emplace_back(std::string{parent_path}, zxid, KeeperStorage::UpdateNodeDelta{std::move(parent_update)}); - KeeperStorage::Node::Stat stat; + Coordination::Stat stat; stat.czxid = zxid; stat.mzxid = zxid; stat.pzxid = zxid; @@ -1135,7 +1187,8 @@ struct KeeperStorageGetRequestProcessor final : public KeeperStorageRequestProce else { node_it->value.setResponseStat(response.stat); - response.data = node_it->value.getData(); + auto data = node_it->value.getData(); + response.data = std::string(data.data, data.size); response.error = Coordination::Error::ZOK; } @@ -1192,8 +1245,8 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr { [zxid](KeeperStorage::Node & parent) { - if (parent.stat.pzxid < zxid) - parent.stat.pzxid = zxid; + if (parent.pzxid < zxid) + parent.pzxid = zxid; } } ); 
@@ -1207,9 +1260,9 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr update_parent_pzxid(); return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; } - else if (request.version != -1 && request.version != node->stat.version) + else if (request.version != -1 && request.version != node->version) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADVERSION}}; - else if (node->stat.numChildren != 0) + else if (node->numChildren() != 0) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNOTEMPTY}}; if (request.restored_from_zookeeper_log) @@ -1220,14 +1273,14 @@ struct KeeperStorageRemoveRequestProcessor final : public KeeperStorageRequestPr zxid, KeeperStorage::UpdateNodeDelta{[](KeeperStorage::Node & parent) { - ++parent.stat.cversion; - --parent.stat.numChildren; + ++parent.cversion; + --parent.ephemeral_or_children_data.children_info.num_children; }}); - new_deltas.emplace_back(request.path, zxid, KeeperStorage::RemoveNodeDelta{request.version, node->stat.ephemeralOwner}); + new_deltas.emplace_back(request.path, zxid, KeeperStorage::RemoveNodeDelta{request.version, node->ephemeralOwner()}); - if (node->stat.ephemeralOwner != 0) - storage.unregisterEphemeralPath(node->stat.ephemeralOwner, request.path); + if (node->isEphemeral()) + storage.unregisterEphemeralPath(node->ephemeralOwner(), request.path); digest = storage.calculateNodesDigest(digest, new_deltas); @@ -1341,7 +1394,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce auto node = storage.uncommitted_state.getNode(request.path); - if (request.version != -1 && request.version != node->stat.version) + if (request.version != -1 && request.version != node->version) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADVERSION}}; new_deltas.emplace_back( @@ -1350,9 +1403,9 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce KeeperStorage::UpdateNodeDelta{ [zxid, data = request.data, time](KeeperStorage::Node & value) { - value.stat.version++; - value.stat.mzxid = zxid; - value.stat.mtime = time; + value.version++; + value.mzxid = zxid; + value.setMtime(time); value.setData(data); }, request.version}); @@ -1364,7 +1417,7 @@ struct KeeperStorageSetRequestProcessor final : public KeeperStorageRequestProce { [](KeeperStorage::Node & parent) { - parent.stat.cversion++; + parent.cversion++; } } ); @@ -1478,7 +1531,7 @@ struct KeeperStorageListRequestProcessor final : public KeeperStorageRequestProc if (child_it == container.end()) onStorageInconsistency(); - const auto is_ephemeral = child_it->value.stat.ephemeralOwner != 0; + const auto is_ephemeral = child_it->value.isEphemeral(); return (is_ephemeral && list_request_type == EPHEMERAL_ONLY) || (!is_ephemeral && list_request_type == PERSISTENT_ONLY); }; @@ -1531,7 +1584,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro auto node = storage.uncommitted_state.getNode(request.path); if (check_not_exists) { - if (node && (request.version == -1 || request.version == node->stat.version)) + if (node && (request.version == -1 || request.version == node->version)) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNODEEXISTS}}; } else @@ -1539,7 +1592,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro if (!node) return {KeeperStorage::Delta{zxid, Coordination::Error::ZNONODE}}; - if (request.version != -1 && request.version != node->stat.version) + if (request.version != -1 && request.version != 
node->version) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADVERSION}}; } @@ -1575,7 +1628,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro if (check_not_exists) { - if (node_it != container.end() && (request.version == -1 || request.version == node_it->value.stat.version)) + if (node_it != container.end() && (request.version == -1 || request.version == node_it->value.version)) on_error(Coordination::Error::ZNODEEXISTS); else response.error = Coordination::Error::ZOK; @@ -1584,7 +1637,7 @@ struct KeeperStorageCheckRequestProcessor final : public KeeperStorageRequestPro { if (node_it == container.end()) on_error(Coordination::Error::ZNONODE); - else if (request.version != -1 && request.version != node_it->value.stat.version) + else if (request.version != -1 && request.version != node_it->value.version) on_error(Coordination::Error::ZBADVERSION); else response.error = Coordination::Error::ZOK; @@ -1637,7 +1690,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr auto node = uncommitted_state.getNode(request.path); - if (request.version != -1 && request.version != node->stat.aversion) + if (request.version != -1 && request.version != node->aversion) return {KeeperStorage::Delta{zxid, Coordination::Error::ZBADVERSION}}; @@ -1657,7 +1710,7 @@ struct KeeperStorageSetACLRequestProcessor final : public KeeperStorageRequestPr zxid, KeeperStorage::UpdateNodeDelta { - [](KeeperStorage::Node & n) { ++n.stat.aversion; } + [](KeeperStorage::Node & n) { ++n.aversion; } } } }; @@ -2075,7 +2128,7 @@ UInt64 KeeperStorage::calculateNodesDigest(UInt64 current_digest, const std::vec [&](const CreateNodeDelta & create_delta) { auto node = std::make_shared(); - node->stat = create_delta.stat; + node->copyStats(create_delta.stat); node->setData(create_delta.data); updated_nodes.emplace(delta.path, node); }, @@ -2198,8 +2251,8 @@ void KeeperStorage::preprocessRequest( { [ephemeral_path](Node & parent) { - ++parent.stat.cversion; - --parent.stat.numChildren; + ++parent.cversion; + --parent.ephemeral_or_children_data.children_info.num_children; } } ); diff --git a/src/Coordination/KeeperStorage.h b/src/Coordination/KeeperStorage.h index 01c1413a884..f14a6ed772c 100644 --- a/src/Coordination/KeeperStorage.h +++ b/src/Coordination/KeeperStorage.h @@ -35,40 +35,144 @@ public: /// New fields should be added to the struct only if it's really necessary struct Node { - /// to reduce size of the Node struct we use a custom Stat without dataLength - struct Stat - { - int64_t czxid{0}; - int64_t mzxid{0}; - int64_t ctime{0}; - int64_t mtime{0}; - int32_t version{0}; - int32_t cversion{0}; - int32_t aversion{0}; - int32_t numChildren{0}; /// NOLINT - int64_t ephemeralOwner{0}; /// NOLINT - int64_t pzxid{0}; - - bool operator==(const Stat &) const = default; - }; - + int64_t czxid{0}; + int64_t mzxid{0}; + int64_t pzxid{0}; uint64_t acl_id = 0; /// 0 -- no ACL by default - Stat stat{}; - int32_t seq_num = 0; + + mutable struct + { + bool has_cached_digest : 1; + int64_t ctime : 7; + } has_cached_digest_and_ctime{false, 0}; + + struct + { + bool is_ephemeral : 1; + int64_t mtime : 7; + } is_ephemeral_and_mtime{false, 0}; + + + union + { + int64_t ephemeral_owner; + struct + { + int32_t seq_num; + int32_t num_children; + } children_info; + } ephemeral_or_children_data{0}; + + char * data{nullptr}; + uint32_t data_size{0}; + + int32_t version{0}; + int32_t cversion{0}; + int32_t aversion{0}; /// we cannot use `std::optional because we want to /// 
pack the boolean with seq_num above - mutable bool has_cached_digest = false; mutable uint64_t cached_digest = 0; + ~Node() + { + if (data_size) + delete [] data; + } + + Node() = default; + + Node & operator=(const Node & other) + { + if (this == &other) + return *this; + + czxid = other.czxid; + mzxid = other.mzxid; + pzxid = other.pzxid; + acl_id = other.acl_id; + has_cached_digest_and_ctime = other.has_cached_digest_and_ctime; + is_ephemeral_and_mtime = other.is_ephemeral_and_mtime; + ephemeral_or_children_data = other.ephemeral_or_children_data; + data_size = other.data_size; + version = other.version; + cversion = other.cversion; + aversion = other.aversion; + + if (data_size != 0) + { + data = new char[data_size]; + memcpy(data, other.data, data_size); + } + return *this; + } + + Node(const Node & other) + { + *this = other; + } + + bool isEphemeral() const + { + + return is_ephemeral_and_mtime.is_ephemeral; + } + + int64_t ephemeralOwner() const + { + return isEphemeral() ? ephemeral_or_children_data.ephemeral_owner : 0; + } + + int32_t numChildren() const + { + return ephemeral_or_children_data.children_info.num_children; + } + + void increaseNumChildren() + { + ++ephemeral_or_children_data.children_info.num_children; + } + + int32_t seqNum() const + { + return ephemeral_or_children_data.children_info.seq_num; + } + + void increaseSeqNum() + { + ++ephemeral_or_children_data.children_info.seq_num; + } + + int64_t ctime() const + { + return has_cached_digest_and_ctime.ctime; + } + + void setCtime(uint64_t ctime) + { + has_cached_digest_and_ctime.ctime = ctime; + } + + int64_t mtime() const + { + return is_ephemeral_and_mtime.mtime; + } + + void setMtime(uint64_t mtime) + { + is_ephemeral_and_mtime.mtime = mtime; + } + + void copyStats(const Coordination::Stat & stat); + void setResponseStat(Coordination::Stat & response_stat) const; /// Object memory size uint64_t sizeInBytes() const; - void setData(String new_data); + void setData(const String & new_data); - const auto & getData() const noexcept { return data; } + StringRef getData() const noexcept { return {data, data_size}; } void addChild(StringRef child_path); @@ -87,7 +191,6 @@ public: // (e.g. 
we don't need to copy list of children) void shallowCopy(const Node & other); private: - String data; ChildrenSet children{}; }; @@ -177,7 +280,7 @@ public: // - quickly commit the changes to the storage struct CreateNodeDelta { - KeeperStorage::Node::Stat stat; + Coordination::Stat stat; Coordination::ACLs acls; String data; }; @@ -342,7 +445,7 @@ public: bool createNode( const std::string & path, String data, - const KeeperStorage::Node::Stat & stat, + const Coordination::Stat & stat, Coordination::ACLs node_acls); // Remove node in the storage diff --git a/src/Coordination/SnapshotableHashTable.h b/src/Coordination/SnapshotableHashTable.h index ac8d36745c2..716f08faf64 100644 --- a/src/Coordination/SnapshotableHashTable.h +++ b/src/Coordination/SnapshotableHashTable.h @@ -19,55 +19,47 @@ struct ListNode StringRef key; V value; - /// |* * ****** | - /// ^ ^ ^ - /// active_in_map free_key version - /// (1 byte) (1 byte) (6 bytes) - uint64_t node_metadata = 0; + struct + { + bool active_in_map : 1; + bool free_key : 1; + uint64_t version : 6; + } node_metadata{false, false, 0}; void setInactiveInMap() { - node_metadata &= ~active_in_map_mask; + node_metadata.active_in_map = false; } void setActiveInMap() { - node_metadata |= active_in_map_mask; + node_metadata.active_in_map = true; } bool isActiveInMap() { - return node_metadata & active_in_map_mask; + return node_metadata.active_in_map; } void setFreeKey() { - node_metadata |= free_key_mask; + node_metadata.free_key = true; } bool getFreeKey() { - return node_metadata & free_key_mask; + return node_metadata.free_key; } uint64_t getVersion() { - return node_metadata & version_mask; + return node_metadata.version; } void setVersion(uint64_t version) { - if (version > version_mask) - throw Exception( - ErrorCodes::LOGICAL_ERROR, "Snapshot version {} is larger than maximum allowed value {}", version, version_mask); - - node_metadata &= ~version_mask; - node_metadata |= version; + node_metadata.version = version; } - - static constexpr uint64_t active_in_map_mask = static_cast(1) << 63; - static constexpr uint64_t free_key_mask = static_cast(1) << 62; - static constexpr uint64_t version_mask = ~(static_cast(3) << 62); }; template diff --git a/src/Coordination/ZooKeeperDataReader.cpp b/src/Coordination/ZooKeeperDataReader.cpp index b55ebef327f..b4334893849 100644 --- a/src/Coordination/ZooKeeperDataReader.cpp +++ b/src/Coordination/ZooKeeperDataReader.cpp @@ -105,7 +105,7 @@ int64_t deserializeStorageData(KeeperStorage & storage, ReadBuffer & in, Poco::L Coordination::read(node.acl_id, in); /// Deserialize stat - Coordination::read(node.stat.czxid, in); + Coordination::read(node.czxid, in); Coordination::read(node.stat.mzxid, in); /// For some reason ZXID specified in filename can be smaller /// then actual zxid from nodes. In this case we will use zxid from nodes.
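The Node and ListNode changes above shrink per-node memory by replacing separate fields and hand-rolled bit masks with bit-fields, and by overlapping mutually exclusive data (ephemeral owner vs. children counters) in a union. Below is a minimal standalone sketch of that packing idea; the field widths and surrounding layout are illustrative only and simplified, not the real KeeperStorage::Node.

#include <cstdint>
#include <cstdio>

/// Simplified, hypothetical node: flags share a word with a timestamp via bit-fields,
/// and the ephemeral/children data share storage through a union, mirroring the patch above.
struct PackedNode
{
    struct
    {
        bool is_ephemeral : 1;
        uint64_t mtime : 63;   /// widths chosen for illustration only
    } flags{false, 0};

    union
    {
        int64_t ephemeral_owner;   /// valid when is_ephemeral is true
        struct
        {
            int32_t seq_num;       /// valid otherwise
            int32_t num_children;
        } children_info;
    } ephemeral_or_children{0};

    int64_t ephemeralOwner() const { return flags.is_ephemeral ? ephemeral_or_children.ephemeral_owner : 0; }
};

/// Naive layout with one field per value, kept only for the size comparison.
struct NaiveNode
{
    bool is_ephemeral = false;
    int64_t mtime = 0;
    int64_t ephemeral_owner = 0;
    int32_t seq_num = 0;
    int32_t num_children = 0;
};

int main()
{
    std::printf("naive: %zu bytes, packed: %zu bytes\n", sizeof(NaiveNode), sizeof(PackedNode));
    return 0;
}

The trade-off is that the discriminating flag must be checked before reading the union, which is why the patch exposes helpers such as isEphemeral(), ephemeralOwner() and numChildren() instead of the raw fields.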
From 8c7218bac2fa09356750e23e79ed686c879665b6 Mon Sep 17 00:00:00 2001 From: Antonio Andelic Date: Mon, 22 Jan 2024 09:40:19 +0000 Subject: [PATCH 140/884] Store latest logs inmemory --- src/Coordination/Changelog.cpp | 393 +++++++++++++++++++++++++-------- src/Coordination/Changelog.h | 48 +++- 2 files changed, 341 insertions(+), 100 deletions(-) diff --git a/src/Coordination/Changelog.cpp b/src/Coordination/Changelog.cpp index 7f1135eec94..c06a8bad91a 100644 --- a/src/Coordination/Changelog.cpp +++ b/src/Coordination/Changelog.cpp @@ -1,4 +1,5 @@ #include +#include #include #include #include @@ -49,9 +50,15 @@ void moveFileBetweenDisks(DiskPtr disk_from, ChangelogFileDescriptionPtr descrip } disk_from->copyFile(from_path, *disk_to, path_to, {}); disk_to->removeFile(tmp_changelog_name); + + /// a different thread could be trying to read from the file + /// we should make sure the source disk contains the file while read is in progress + { + std::lock_guard file_lock(description->file_mutex); + description->disk = disk_to; + } disk_from->removeFile(description->path); description->path = path_to; - description->disk = disk_to; } constexpr auto DEFAULT_PREFIX = "changelog"; @@ -111,9 +118,11 @@ class ChangelogWriter public: ChangelogWriter( std::map & existing_changelogs_, + LogEntryStorage & entry_storage_, KeeperContextPtr keeper_context_, LogFileSettings log_file_settings_) : existing_changelogs(existing_changelogs_) + , entry_storage(entry_storage_) , log_file_settings(log_file_settings_) , keeper_context(std::move(keeper_context_)) , log(&Poco::Logger::get("Changelog")) @@ -238,6 +247,7 @@ public: } auto & write_buffer = getBuffer(); + auto current_position = write_buffer.count(); writeIntBinary(computeRecordChecksum(record), write_buffer); writeIntBinary(record.header.version, write_buffer); @@ -255,6 +265,11 @@ public: /// Flush compressed data to file buffer compressed_buffer->next(); } + else + { + unflushed_indices_with_log_location.emplace_back( + record.header.index, LogLocation{.file_description = current_file_description, .position = current_position}); + } last_index_written = record.header.index; @@ -272,6 +287,8 @@ public: else file_buffer->next(); } + entry_storage.addLogLocations(std::move(unflushed_indices_with_log_location)); + unflushed_indices_with_log_location.clear(); } uint64_t getStartIndex() const @@ -314,9 +331,9 @@ public: private: void finalizeCurrentFile() { - assert(prealloc_done); + chassert(prealloc_done); - assert(current_file_description); + chassert(current_file_description); // compact can delete the file and we don't need to do anything if (current_file_description->deleted) { @@ -400,9 +417,11 @@ private: { const auto * file_buffer = tryGetFileBuffer(); + if (file_buffer) + initial_file_size = getSizeFromFileDescriptor(file_buffer->getFD()); + if (log_file_settings.max_size == 0 || !file_buffer) { - initial_file_size = 0; prealloc_done = true; return; } @@ -428,7 +447,6 @@ private: } } #endif - initial_file_size = getSizeFromFileDescriptor(file_buffer->getFD()); prealloc_done = true; } @@ -441,6 +459,10 @@ private: std::map & existing_changelogs; + LogEntryStorage & entry_storage; + + std::vector> unflushed_indices_with_log_location; + ChangelogFileDescriptionPtr current_file_description{nullptr}; std::unique_ptr file_buf; std::optional last_index_written; @@ -482,69 +504,88 @@ struct ChangelogReadResult bool error; }; +namespace +{ + +ChangelogRecord readChangelogRecord(ReadBuffer & read_buf, const std::string & filepath) +{ + /// Read checksum + 
Checksum record_checksum; + readIntBinary(record_checksum, read_buf); + + /// Read header + ChangelogRecord record; + readIntBinary(record.header.version, read_buf); + readIntBinary(record.header.index, read_buf); + readIntBinary(record.header.term, read_buf); + readIntBinary(record.header.value_type, read_buf); + readIntBinary(record.header.blob_size, read_buf); + + if (record.header.version > CURRENT_CHANGELOG_VERSION) + throw Exception( + ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", static_cast(record.header.version), filepath); + + /// Read data + if (record.header.blob_size != 0) + { + auto buffer = nuraft::buffer::alloc(record.header.blob_size); + auto * buffer_begin = reinterpret_cast(buffer->data_begin()); + read_buf.readStrict(buffer_begin, record.header.blob_size); + record.blob = buffer; + } + else + record.blob = nullptr; + + /// Compare checksums + Checksum checksum = computeRecordChecksum(record); + if (checksum != record_checksum) + { + throw Exception( + ErrorCodes::CHECKSUM_DOESNT_MATCH, + "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", + filepath, + record.header.version, + record.header.index, + record.header.blob_size); + } + + return record; +} + +LogEntryPtr logEntryFromRecord(const ChangelogRecord & record) +{ + return nuraft::cs_new(record.header.term, record.blob, static_cast(record.header.value_type)); +} + +} + class ChangelogReader { public: - explicit ChangelogReader(DiskPtr disk_, const std::string & filepath_) : disk(disk_), filepath(filepath_) + explicit ChangelogReader(ChangelogFileDescriptionPtr changelog_description_) : changelog_description(changelog_description_) { - compression_method = chooseCompressionMethod(filepath, ""); - auto read_buffer_from_file = disk->readFile(filepath); + compression_method = chooseCompressionMethod(changelog_description->path, ""); + auto read_buffer_from_file = changelog_description->disk->readFile(changelog_description->path); read_buf = wrapReadBufferWithCompressionMethod(std::move(read_buffer_from_file), compression_method); } /// start_log_index -- all entries with index < start_log_index will be skipped, but accounted into total_entries_read_from_log - ChangelogReadResult readChangelog(IndexToLogEntry & logs, uint64_t start_log_index, Poco::Logger * log) + ChangelogReadResult readChangelog(LogEntryStorage & entry_storage, uint64_t start_log_index, Poco::Logger * log) { ChangelogReadResult result{}; result.compressed_log = compression_method != CompressionMethod::None; + const auto & filepath = changelog_description->path; try { while (!read_buf->eof()) { result.last_position = read_buf->count(); - /// Read checksum - Checksum record_checksum; - readIntBinary(record_checksum, *read_buf); - /// Read header - ChangelogRecord record; - readIntBinary(record.header.version, *read_buf); - readIntBinary(record.header.index, *read_buf); - readIntBinary(record.header.term, *read_buf); - readIntBinary(record.header.value_type, *read_buf); - readIntBinary(record.header.blob_size, *read_buf); - - if (record.header.version > CURRENT_CHANGELOG_VERSION) - throw Exception( - ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unsupported changelog version {} on path {}", static_cast(record.header.version), filepath); - - /// Read data - if (record.header.blob_size != 0) - { - auto buffer = nuraft::buffer::alloc(record.header.blob_size); - auto * buffer_begin = reinterpret_cast(buffer->data_begin()); - read_buf->readStrict(buffer_begin, record.header.blob_size); - record.blob = buffer; 
- } - else - record.blob = nullptr; - - /// Compare checksums - Checksum checksum = computeRecordChecksum(record); - if (checksum != record_checksum) - { - throw Exception( - ErrorCodes::CHECKSUM_DOESNT_MATCH, - "Checksums doesn't match for log {} (version {}), index {}, blob_size {}", - filepath, - record.header.version, - record.header.index, - record.header.blob_size); - } + auto record = readChangelogRecord(*read_buf, filepath); /// Check for duplicated changelog ids - if (logs.contains(record.header.index)) - std::erase_if(logs, [&record](const auto & item) { return item.first >= record.header.index; }); + if (entry_storage.contains(record.header.index)) + entry_storage.eraseIf([&record](const auto index) { return index >= record.header.index; }); result.total_entries_read_from_log += 1; @@ -553,12 +594,15 @@ public: continue; /// Create log entry for read data - auto log_entry = nuraft::cs_new(record.header.term, record.blob, static_cast(record.header.value_type)); + auto log_entry = logEntryFromRecord(record); if (result.first_read_index == 0) result.first_read_index = record.header.index; /// Put it into in memory structure - logs.emplace(record.header.index, log_entry); + entry_storage.addEntryWithLocation( + record.header.index, + log_entry, + LogLocation{.file_description = changelog_description, .position = static_cast(result.last_position)}); result.last_read_index = record.header.index; if (result.total_entries_read_from_log % 50000 == 0) @@ -585,12 +629,189 @@ public: } private: - DiskPtr disk; - std::string filepath; + ChangelogFileDescriptionPtr changelog_description; CompressionMethod compression_method; std::unique_ptr read_buf; }; +size_t LogEntryStorage::size() const +{ + return total_entries; +} + +void LogEntryStorage::addEntry(uint64_t index, const LogEntryPtr & log_entry) +{ + logs_cache.insert_or_assign(index, log_entry); + if (logs_cache.size() == 1) + min_index_in_cache = index; + + ++total_entries; +} + +void LogEntryStorage::addEntryWithLocation(uint64_t index, const LogEntryPtr & log_entry, LogLocation log_location) +{ + logs_cache.emplace(index, log_entry); + logs_location.emplace(index, std::move(log_location)); + if (logs_cache.size() == 1) + min_index_in_cache = index; + else if (logs_cache.size() > 1000) + { + logs_cache.erase(min_index_in_cache); + ++min_index_in_cache; + } +} + +void LogEntryStorage::eraseIf(std::function index_predicate) +{ + std::erase_if(logs_cache, [&](const auto & item) { return index_predicate(item.first); }); +} + +bool LogEntryStorage::contains(uint64_t index) const +{ + return logs_cache.contains(index); +} + +LogEntryPtr LogEntryStorage::getEntry(uint64_t index) const +{ + if (index >= min_index_in_cache) + return logs_cache.at(index); + + std::lock_guard lock(logs_location_mutex); + + if (auto it = logs_location.find(index); it != logs_location.end()) + { + const auto & [changelog_description, position] = it->second; + std::lock_guard file_lock(changelog_description->file_mutex); + //std::cout << "Reading from path " << changelog_description->path << std::endl; + auto file = changelog_description->disk->readFile(changelog_description->path); + file->seek(position, SEEK_SET); + + auto record = readChangelogRecord(*file, changelog_description->path); + return logEntryFromRecord(record); + } + else + std::cout << "Nothing found" << std::endl; + + return nullptr; +} + +void LogEntryStorage::clear() +{ + logs_cache.clear(); +} + +LogEntryPtr LogEntryStorage::getLatestConfigChange() const +{ + for (const auto & [_, entry] : 
logs_cache) + if (entry->get_val_type() == nuraft::conf) + return entry; + return nullptr; +} + +void LogEntryStorage::addLogLocations(std::vector> indices_with_log_locations) +{ + std::lock_guard lock(logs_location_mutex); + unapplied_indices_with_log_locations.insert( + unapplied_indices_with_log_locations.end(), + std::make_move_iterator(indices_with_log_locations.begin()), + std::make_move_iterator(indices_with_log_locations.end())); +} + +void LogEntryStorage::refreshCache() +{ + if (logs_cache.size() <= 1000) + return; + + std::lock_guard lock(logs_location_mutex); + if (logs_location.empty()) + return; + + auto max_index_to_remove = min_index_in_cache + (logs_cache.size() - 1000); + for (auto & [index, log_location] : unapplied_indices_with_log_locations) + { + logs_location.emplace(index, std::move(log_location)); + max_index_with_location = index; + } + + for (size_t index = min_index_in_cache; index < max_index_to_remove; ++index) + { + if (index <= max_index_with_location) + { + logs_cache.erase(index); + min_index_in_cache = index + 1; + } + } + + unapplied_indices_with_log_locations.clear(); +} + +LogEntriesPtr LogEntryStorage::getLogEntriesBetween(uint64_t start, uint64_t end) const +{ + LogEntriesPtr ret = nuraft::cs_new>>(); + ret->reserve(end - start); + + /// we rely on fact that changelogs need to be written sequentially with + /// no other writes between + struct ReadInfo + { + ChangelogFileDescriptionPtr file_description; + size_t start_position = 0; + size_t count = 0; + }; + + /// we have to collect some logs from disks because they are not cached + if (start < min_index_in_cache) + { + //std::cout << "Reading some from disk" << std::endl; + std::lock_guard logs_location_lock(logs_location_mutex); + std::vector read_infos; + for (uint64_t i = start; i < min_index_in_cache && i < end; ++i) + { + const auto & log_location = logs_location.at(i); + const auto push_new_file = [&] + { + read_infos.push_back(ReadInfo + { + .file_description = log_location.file_description, + .start_position = log_location.position, + .count = 1, + }); + }; + + if (read_infos.empty()) + push_new_file(); + else if (auto & last = read_infos.back(); log_location.file_description == last.file_description) + ++last.count; + else + push_new_file(); + } + + for (const auto & [file_description, start_position, count] : read_infos) + { + std::cout << "Reading from path " << file_description->path << " " << count << " entries" << std::endl; + std::lock_guard file_lock(file_description->file_mutex); + auto file = file_description->disk->readFile(file_description->path); + file->seek(start_position, SEEK_SET); + + for (size_t i = 0; i < count; ++i) + { + auto record = readChangelogRecord(*file, file_description->path); + ret->push_back(logEntryFromRecord(record)); + } + } + + start = min_index_in_cache; + } + else + std::cout << "Nothing read from disk" << std::endl; + + for (uint64_t i = start; i < end; ++i) + ret->push_back(logs_cache.at(i)); + + return ret; + +} + Changelog::Changelog( Poco::Logger * log_, LogFileSettings log_file_settings, FlushSettings flush_settings_, KeeperContextPtr keeper_context_) : changelogs_detached_dir("detached") @@ -706,7 +927,7 @@ Changelog::Changelog( append_completion_thread = ThreadFromGlobalPool([this] { appendCompletionThread(); }); - current_writer = std::make_unique(existing_changelogs, keeper_context, log_file_settings); + current_writer = std::make_unique(existing_changelogs, entry_storage, keeper_context, log_file_settings); } void 
Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uint64_t logs_to_keep) @@ -783,8 +1004,8 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin break; } - ChangelogReader reader(changelog_description.disk, changelog_description.path); - last_log_read_result = reader.readChangelog(logs, start_to_read_from, log); + ChangelogReader reader(changelog_description_ptr); + last_log_read_result = reader.readChangelog(entry_storage, start_to_read_from, log); if (last_log_read_result->last_read_index != 0) last_read_index = last_log_read_result->last_read_index; @@ -861,13 +1082,13 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin remove_invalid_logs(); description->disk->removeFile(description->path); existing_changelogs.erase(last_log_read_result->log_start_index); - std::erase_if(logs, [last_log_read_result](const auto & item) { return item.first >= last_log_read_result->log_start_index; }); + entry_storage.eraseIf([last_log_read_result](const auto index) { return index >= last_log_read_result->log_start_index; }); } else if (last_log_read_result->error) { LOG_INFO(log, "Chagelog {} read finished with error but some logs were read from it, file will not be removed", description->path); remove_invalid_logs(); - std::erase_if(logs, [last_log_read_result](const auto & item) { return item.first > last_log_read_result->last_read_index; }); + entry_storage.eraseIf([last_log_read_result](const auto index) { return index > last_log_read_result->last_read_index; }); move_from_latest_logs_disks(existing_changelogs.at(last_log_read_result->log_start_index)); } /// don't mix compressed and uncompressed writes @@ -902,7 +1123,6 @@ void Changelog::readChangelogAndInitWriter(uint64_t last_commited_log_index, uin moveFileBetweenDisks(description->disk, description, disk, description->path); } - initialized = true; } @@ -1006,14 +1226,14 @@ void Changelog::removeAllLogsAfter(uint64_t remove_after_log_start_index) LOG_WARNING(log, "Removing changelogs that go after broken changelog entry"); removeExistingLogs(start_to_remove_from_itr, existing_changelogs.end()); - std::erase_if(logs, [start_to_remove_from_log_id](const auto & item) { return item.first >= start_to_remove_from_log_id; }); + entry_storage.eraseIf([start_to_remove_from_log_id](const auto index) { return index >= start_to_remove_from_log_id; }); } void Changelog::removeAllLogs() { LOG_WARNING(log, "Removing all changelogs"); removeExistingLogs(existing_changelogs.begin(), existing_changelogs.end()); - logs.clear(); + entry_storage.clear(); } ChangelogRecord Changelog::buildRecord(uint64_t index, const LogEntryPtr & log_entry) @@ -1157,10 +1377,10 @@ void Changelog::appendEntry(uint64_t index, const LogEntryPtr & log_entry) if (!initialized) throw Exception(ErrorCodes::LOGICAL_ERROR, "Changelog must be initialized before appending records"); - if (logs.empty()) + if (min_log_id == 0) min_log_id = index; - logs[index] = log_entry; + entry_storage.addEntry(index, log_entry); max_log_id = index; if (!write_operations.push(AppendLog{index, log_entry})) @@ -1207,7 +1427,7 @@ void Changelog::writeAt(uint64_t index, const LogEntryPtr & log_entry) /// Remove redundant logs from memory /// Everything >= index must be removed - std::erase_if(logs, [index](const auto & item) { return item.first >= index; }); + entry_storage.eraseIf([index](const auto current_index) { return current_index >= index; }); /// Now we can actually override entry at index appendEntry(index, log_entry); 
@@ -1276,7 +1496,8 @@ void Changelog::compact(uint64_t up_to_log_index) } /// Compaction from the past is possible, so don't make our min_log_id smaller. min_log_id = std::max(min_log_id, up_to_log_index + 1); - std::erase_if(logs, [up_to_log_index](const auto & item) { return item.first <= up_to_log_index; }); + + entry_storage.eraseIf([up_to_log_index](const auto index) { return index <= up_to_log_index; }); if (need_rotate) current_writer->rotate(up_to_log_index + 1); @@ -1289,46 +1510,26 @@ LogEntryPtr Changelog::getLastEntry() const /// This entry treaded in special way by NuRaft static LogEntryPtr fake_entry = nuraft::cs_new(0, nuraft::buffer::alloc(sizeof(uint64_t))); - auto entry = logs.find(max_log_id); - if (entry == logs.end()) - { + auto entry = entry_storage.getEntry(max_log_id); + if (entry == nullptr) return fake_entry; - } - return entry->second; + return entry; } LogEntriesPtr Changelog::getLogEntriesBetween(uint64_t start, uint64_t end) { - LogEntriesPtr ret = nuraft::cs_new>>(); - - ret->resize(end - start); - uint64_t result_pos = 0; - for (uint64_t i = start; i < end; ++i) - { - (*ret)[result_pos] = entryAt(i); - result_pos++; - } - return ret; + return entry_storage.getLogEntriesBetween(start, end); } LogEntryPtr Changelog::entryAt(uint64_t index) { - nuraft::ptr src = nullptr; - auto entry = logs.find(index); - if (entry == logs.end()) - return nullptr; - - src = entry->second; - return src; + return entry_storage.getEntry(index); } LogEntryPtr Changelog::getLatestConfigChange() const { - for (const auto & [_, entry] : logs) - if (entry->get_val_type() == nuraft::conf) - return entry; - return nullptr; + return entry_storage.getLatestConfigChange(); } nuraft::ptr Changelog::serializeEntriesToBuffer(uint64_t index, int32_t count) @@ -1339,11 +1540,11 @@ nuraft::ptr Changelog::serializeEntriesToBuffer(uint64_t index, uint64_t size_total = 0; for (uint64_t i = index; i < index + count; ++i) { - auto entry = logs.find(i); - if (entry == logs.end()) + auto entry = entry_storage.getEntry(i); + if (entry == nullptr) throw Exception(ErrorCodes::LOGICAL_ERROR, "Don't have log entry {}", i); - nuraft::ptr buf = entry->second->serialize(); + nuraft::ptr buf = entry->serialize(); size_total += buf->size(); returned_logs.push_back(std::move(buf)); } @@ -1374,7 +1575,7 @@ void Changelog::applyEntriesFromBuffer(uint64_t index, nuraft::buffer & buffer) buffer.get(buf_local); LogEntryPtr log_entry = nuraft::log_entry::deserialize(*buf_local); - if (i == 0 && logs.contains(cur_index)) + if (i == 0 && entry_storage.contains(cur_index)) writeAt(cur_index, log_entry); else appendEntry(cur_index, log_entry); @@ -1409,6 +1610,8 @@ std::shared_ptr Changelog::flushAsync() LOG_WARNING(log, "Changelog is shut down"); return nullptr; } + + entry_storage.refreshCache(); return failed; } diff --git a/src/Coordination/Changelog.h b/src/Coordination/Changelog.h index 20f850e3f62..ee212ef3a71 100644 --- a/src/Coordination/Changelog.h +++ b/src/Coordination/Changelog.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -23,7 +22,6 @@ using LogEntries = std::vector; using LogEntriesPtr = nuraft::ptr; using BufferPtr = nuraft::ptr; -using IndexToOffset = std::unordered_map; using IndexToLogEntry = std::unordered_map; enum class ChangelogVersion : uint8_t @@ -63,6 +61,8 @@ struct ChangelogFileDescription DiskPtr disk; std::string path; + std::mutex file_mutex; + bool deleted = false; /// How many entries should be stored in this log @@ -87,6 +87,43 @@ struct FlushSettings uint64_t 
max_flush_batch_size = 1000; }; +struct LogLocation +{ + ChangelogFileDescriptionPtr file_description; + size_t position; +}; + +struct LogEntryStorage +{ + size_t size() const; + + void addEntry(uint64_t index, const LogEntryPtr & log_entry); + void addEntryWithLocation(uint64_t index, const LogEntryPtr & log_entry, LogLocation log_location); + void eraseIf(std::function index_predicate); + bool contains(uint64_t index) const; + LogEntryPtr getEntry(uint64_t index) const; + void clear(); + LogEntryPtr getLatestConfigChange() const; + + using IndexWithLogLocation = std::pair; + + void addLogLocations(std::vector indices_with_log_locations); + + void refreshCache(); + + LogEntriesPtr getLogEntriesBetween(uint64_t start, uint64_t end) const; +private: + /// Mapping log_id -> log_entry + IndexToLogEntry logs_cache; + size_t min_index_in_cache = 0; + + size_t total_entries = 0; + mutable std::mutex logs_location_mutex; + std::vector unapplied_indices_with_log_locations; + std::unordered_map logs_location; + size_t max_index_with_location = 0; +}; + /// Simplest changelog with files rotation. /// No compression, no metadata, just entries with headers one by one. /// Able to read broken files/entries and discard them. Not thread safe. @@ -143,7 +180,7 @@ public: void shutdown(); - uint64_t size() const { return logs.size(); } + uint64_t size() const { return entry_storage.size(); } uint64_t lastDurableIndex() const { @@ -190,8 +227,9 @@ private: std::mutex writer_mutex; /// Current writer for changelog file std::unique_ptr current_writer; - /// Mapping log_id -> log_entry - IndexToLogEntry logs; + + LogEntryStorage entry_storage; + /// Start log_id which exists in all "active" logs /// min_log_id + 1 == max_log_id means empty log storage for NuRaft uint64_t min_log_id = 0; From 09f1e2840c2859a517c9f76183a7abd51c488b6f Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Thu, 25 Jan 2024 10:06:05 +0100 Subject: [PATCH 141/884] Simplified calculatePartSize and upload task --- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 2 - .../AzureBlobStorage/AzureObjectStorage.h | 4 - .../copyAzureBlobStorageFile.cpp | 114 +++++------------- 3 files changed, 27 insertions(+), 93 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index cbc2996f5c1..02b0d5bb599 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -165,9 +165,7 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getInt(config_prefix + ".max_single_read_retries", 3), config.getInt(config_prefix + ".max_single_download_retries", 3), config.getInt(config_prefix + ".list_object_keys_size", 1000), - config.getUInt64(config_prefix + ".min_upload_part_size", 16 * 1024 * 1024), config.getUInt64(config_prefix + ".max_upload_part_size", 5ULL * 1024 * 1024 * 1024), - config.getUInt64(config_prefix + ".max_part_number", 10000), config.getBool(config_prefix + ".use_native_copy", false) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 3be4989d4f2..30fedb601dc 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -24,18 +24,14 @@ struct AzureObjectStorageSettings int max_single_read_retries_, int max_single_download_retries_, int 
list_object_keys_size_, - size_t min_upload_part_size_, size_t max_upload_part_size_, - size_t max_part_number_, bool use_native_copy_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) , max_single_read_retries(max_single_read_retries_) , max_single_download_retries(max_single_download_retries_) , list_object_keys_size(list_object_keys_size_) - , min_upload_part_size(min_upload_part_size_) , max_upload_part_size(max_upload_part_size_) - , max_part_number(max_part_number_) , use_native_copy(use_native_copy_) { } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 350d2d1d34e..e5517a1a021 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -84,17 +84,10 @@ namespace struct UploadPartTask { - char *data = nullptr; - size_t size = 0; - std::string block_id; + std::unique_ptr read_buffer = nullptr; + std::vector block_ids; bool is_finished = false; std::exception_ptr exception; - - ~UploadPartTask() - { - if (data != nullptr) - free(data); - } }; size_t normal_part_size; @@ -108,56 +101,11 @@ namespace void calculatePartSize() { - if (!total_size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Chosen multipart upload for an empty file. This must not happen"); - - auto max_part_number = settings.get()->max_part_number; - auto min_upload_part_size = settings.get()->min_upload_part_size; auto max_upload_part_size = settings.get()->max_upload_part_size; - - if (!max_part_number) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_part_number must not be 0"); - else if (!min_upload_part_size) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "min_upload_part_size must not be 0"); - else if (max_upload_part_size < min_upload_part_size) - throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_upload_part_size must not be less than min_upload_part_size"); - - size_t part_size = min_upload_part_size; - size_t num_parts = (total_size + part_size - 1) / part_size; - - if (num_parts > max_part_number) - { - part_size = (total_size + max_part_number - 1) / max_part_number; - num_parts = (total_size + part_size - 1) / part_size; - } - - if (part_size > max_upload_part_size) - { - part_size = max_upload_part_size; - num_parts = (total_size + part_size - 1) / part_size; - } - - if (num_parts < 1 || num_parts > max_part_number || part_size < min_upload_part_size || part_size > max_upload_part_size) - { - String msg; - if (num_parts < 1) - msg = "Number of parts is zero"; - else if (num_parts > max_part_number) - msg = fmt::format("Number of parts exceeds {}", num_parts, max_part_number); - else if (part_size < min_upload_part_size) - msg = fmt::format("Size of a part is less than {}", part_size, min_upload_part_size); - else - msg = fmt::format("Size of a part exceeds {}", part_size, max_upload_part_size); - - throw Exception( - ErrorCodes::INVALID_CONFIG_PARAMETER, - "{} while writing {} bytes to AzureBlobStorage. Check max_part_number = {}, " - "min_upload_part_size = {}, max_upload_part_size = {}", - msg, total_size, max_part_number, min_upload_part_size, max_upload_part_size); - } - + if (!max_upload_part_size) + throw Exception(ErrorCodes::INVALID_CONFIG_PARAMETER, "max_upload_part_size must not be 0"); /// We've calculated the size of a normal part (the final part can be smaller). 
- normal_part_size = part_size; + normal_part_size = max_upload_part_size; } public: @@ -238,18 +186,13 @@ namespace try { - auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); - task->data = new char[part_size]; - task->size = part_size; - size_t n = read_buffer->read(task->data,part_size); - if (n != part_size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected size"); + task->read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); schedule([this, task, task_finish_notify]() { try { - processUploadTask(*task); + processUploadPartRequest(*task); } catch (...) { @@ -267,38 +210,35 @@ namespace else { UploadPartTask task; - auto read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); - task.data = new char[part_size]; - size_t n = read_buffer->read(task.data,part_size); - if (n != part_size) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected size"); - task.size = part_size; - processUploadTask(task); - block_ids.emplace_back(task.block_id); + task.read_buffer = std::make_unique(create_read_buffer(), part_offset, part_size); + processUploadPartRequest(task); + block_ids.insert(block_ids.end(),task.block_ids.begin(), task.block_ids.end()); } } - void processUploadTask(UploadPartTask & task) - { - auto block_id = processUploadPartRequest(task); - - std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race - task.block_id = block_id; - LOG_TRACE(log, "Writing part finished. Container: {}, Blob: {}, block_id: {}, Parts: {}", dest_container_for_logging, dest_blob, block_id, bg_tasks.size()); - } - - String processUploadPartRequest(UploadPartTask & task) + void processUploadPartRequest(UploadPartTask & task) { ProfileEvents::increment(ProfileEvents::AzureUploadPart); if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureUploadPart); auto block_blob_client = client.get()->GetBlockBlobClient(dest_blob); - task.block_id = getRandomASCIIString(64); - Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(task.data), task.size); - block_blob_client.StageBlock(task.block_id, memory); - return task.block_id; + while (!task.read_buffer->eof()) + { + auto size = task.read_buffer->available(); + if (size > 0) + { + auto block_id = getRandomASCIIString(64); + Azure::Core::IO::MemoryBodyStream memory(reinterpret_cast(task.read_buffer->position()), size); + block_blob_client.StageBlock(block_id, memory); + task.block_ids.emplace_back(block_id); + task.read_buffer->ignore(size); + LOG_TRACE(log, "Writing part. Container: {}, Blob: {}, block_id: {}", dest_container_for_logging, dest_blob, block_id); + } + } + std::lock_guard lock(bg_tasks_mutex); /// Protect bg_tasks from race + LOG_TRACE(log, "Writing part finished. 
Container: {}, Blob: {}, Parts: {}", dest_container_for_logging, dest_blob, bg_tasks.size()); } @@ -316,7 +256,7 @@ namespace { if (task.exception) std::rethrow_exception(task.exception); - block_ids.emplace_back(task.block_id); + block_ids.insert(block_ids.end(),task.block_ids.begin(), task.block_ids.end()); } } }; From 288d288b8766f75670c82f9f4f190591f2ba7332 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 25 Jan 2024 19:57:51 +0200 Subject: [PATCH 142/884] fix failing 00937_template_output_format --- src/Core/Settings.h | 2 +- src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp | 8 ++++---- .../0_stateless/00937_format_schema_rows_template.sh | 3 +-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index d0a327e2d44..f9e3f401d98 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1080,7 +1080,7 @@ class IColumn; M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ M(String, format_template_row, "", "Path to file which contains format string for rows (for Template format)", 0) \ - M(String, format_schema_rows_template, "\n", "Format string for rows (for Template format)", 0) \ + M(String, format_schema_rows_template, "", "Format string for rows (for Template format)", 0) \ M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ \ M(EscapingRule, format_custom_escaping_rule, "Escaped", "Field escaping rule (for CustomSeparated format)", 0) \ diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index 99a7f59c09e..efda754917b 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -226,14 +226,14 @@ void registerOutputFormatTemplate(FormatFactory & factory) } else { - if (!settings.template_settings.row_format_schema.empty()) - { - throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template, but not both"); - } row_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), idx_by_name); + if (!settings.template_settings.row_format_schema.empty()) + { + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template, but not both"); + } } return std::make_shared(sample, buf, settings, resultset_format, row_format, settings.template_settings.row_between_delimiter); }); diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.sh b/tests/queries/0_stateless/00937_format_schema_rows_template.sh index 8b512513d94..6161f71e78e 100755 --- a/tests/queries/0_stateless/00937_format_schema_rows_template.sh +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.sh @@ -12,7 +12,7 @@ $CLICKHOUSE_CLIENT --query="CREATE TABLE template (question String, answer Strin $CLICKHOUSE_CLIENT --query="INSERT INTO template VALUES ('How awesome is clickhouse?', 'unbelievably awesome!', 456, '2016-01-02'),\ ('How fast is clickhouse?', 'Lightning fast!', 9876543210, '2016-01-03'),\ -('Is it opensource', 'of course it is!', 789, '2016-01-04')"; +('Is it opensource?', 'of course it is!', 789, '2016-01-04')"; 
$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ @@ -21,7 +21,6 @@ format_template_rows_between_delimiter = ';\n'"; echo -e "\n" # Test that if both format_schema_rows_template setting and format_template_row are provided, error is thrown - echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Likes: ${likes:Raw}, Date: ${date:Raw}' > "$CURDIR"/00937_template_output_format_row.tmp $CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ format_template_row = '$CURDIR/00937_template_output_format_row.tmp', \ From a74c78c4f0cb0ff30014afe2edd78efca3450f49 Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 25 Jan 2024 20:56:23 +0200 Subject: [PATCH 143/884] fix failing test 00937_format_schema_rows_template.sh --- .../queries/0_stateless/00937_format_schema_rows_template.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.sh b/tests/queries/0_stateless/00937_format_schema_rows_template.sh index 6161f71e78e..aff5de3b555 100755 --- a/tests/queries/0_stateless/00937_format_schema_rows_template.sh +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.sh @@ -22,10 +22,10 @@ echo -e "\n" # Test that if both format_schema_rows_template setting and format_template_row are provided, error is thrown echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Likes: ${likes:Raw}, Date: ${date:Raw}' > "$CURDIR"/00937_template_output_format_row.tmp -$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +$CLICKHOUSE_CLIENT --multiline --multiquery --query "SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ format_template_row = '$CURDIR/00937_template_output_format_row.tmp', \ format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ -format_template_rows_between_delimiter = ';\n'; --{ serverError 474 }" +format_template_rows_between_delimiter = ';\n'; --{clientError 474}" $CLICKHOUSE_CLIENT --query="DROP TABLE template"; rm "$CURDIR"/00937_template_output_format_row.tmp From 64d18ad8e706e859e1ddd9398ec0a96ae088e07c Mon Sep 17 00:00:00 2001 From: Blargian Date: Thu, 25 Jan 2024 22:04:21 +0200 Subject: [PATCH 144/884] CI trigger --- .../0_stateless/00937_format_schema_rows_template.reference | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.reference b/tests/queries/0_stateless/00937_format_schema_rows_template.reference index 167f16ec55f..5f59cca2629 100644 --- a/tests/queries/0_stateless/00937_format_schema_rows_template.reference +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.reference @@ -1,4 +1,4 @@ Question: 'How awesome is clickhouse?', Answer: 'unbelievably awesome!', Number of Likes: 456, Date: 2016-01-02; Question: 'How fast is clickhouse?', Answer: 'Lightning fast!', Number of Likes: 9876543210, Date: 2016-01-03; -Question: 'Is it opensource', Answer: 'of course it is!', Number of Likes: 789, 
Date: 2016-01-04 +Question: 'Is it opensource?', Answer: 'of course it is!', Number of Likes: 789, Date: 2016-01-04 From 36055bd0089f52473f893d71c475a2782a45e8b4 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Thu, 25 Jan 2024 21:44:46 +0000 Subject: [PATCH 145/884] init --- src/Functions/FunctionBinaryArithmetic.h | 106 ++++++++++++++---- src/Functions/IsOperation.h | 4 +- .../02975_intdiv_with_decimal.reference | 52 +++++++++ .../0_stateless/02975_intdiv_with_decimal.sql | 54 +++++++++ 4 files changed, 196 insertions(+), 20 deletions(-) create mode 100644 tests/queries/0_stateless/02975_intdiv_with_decimal.reference create mode 100644 tests/queries/0_stateless/02975_intdiv_with_decimal.sql diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index 1b2519d1ec5..e34514d15fd 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -146,10 +146,24 @@ private: /// it's not correct for Decimal public: static constexpr bool allow_decimal = IsOperation::allow_decimal; + static constexpr bool only_integer = IsOperation::div_int || IsOperation::div_int_or_zero; /// Appropriate result type for binary operator on numeric types. "Date" can also mean /// DateTime, but if both operands are Dates, their type must be the same (e.g. Date - DateTime is invalid). using ResultDataType = Switch< + /// Result must be Integer + Case< + only_integer && IsDataTypeDecimal && IsDataTypeDecimal, + Switch< + Case || std::is_same_v, DataTypeInt256>, + Case || std::is_same_v, DataTypeInt128>, + Case || std::is_same_v, DataTypeInt64>, + Case || std::is_same_v, DataTypeInt32>>>, + Case< + only_integer, + Switch< + Case, LeftDataType>, + Case, RightDataType>>>, /// Decimal cases Case || IsDataTypeDecimal), InvalidType>, Case< @@ -1667,31 +1681,77 @@ public: { if constexpr (IsDataTypeDecimal && IsDataTypeDecimal) { - if constexpr (is_division) + if constexpr (is_div_int || is_div_int_or_zero) { - if (context->getSettingsRef().decimal_check_overflow) - { - /// Check overflow by using operands scale (based on big decimal division implementation details): - /// big decimal arithmetic is based on big integers, decimal operands are converted to big integers - /// i.e. int_operand = decimal_operand*10^scale - /// For division, left operand will be scaled by right operand scale also to do big integer division, - /// BigInt result = left*10^(left_scale + right_scale) / right * 10^right_scale - /// So, we can check upfront possible overflow just by checking max scale used for left operand - /// Note: it doesn't detect all possible overflow during big decimal division - if (left.getScale() + right.getScale() > ResultDataType::maxPrecision()) - throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Overflow during decimal division"); - } + if constexpr (std::is_same_v || std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v || std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v || std::is_same_v) + type_res = std::make_shared(); + else + type_res = std::make_shared(); + } + else + { + if constexpr (is_division) + { + if (context->getSettingsRef().decimal_check_overflow) + { + /// Check overflow by using operands scale (based on big decimal division implementation details): + /// big decimal arithmetic is based on big integers, decimal operands are converted to big integers + /// i.e. 
int_operand = decimal_operand*10^scale + /// For division, left operand will be scaled by right operand scale also to do big integer division, + /// BigInt result = left*10^(left_scale + right_scale) / right * 10^right_scale + /// So, we can check upfront possible overflow just by checking max scale used for left operand + /// Note: it doesn't detect all possible overflow during big decimal division + if (left.getScale() + right.getScale() > ResultDataType::maxPrecision()) + throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Overflow during decimal division"); + } + } + ResultDataType result_type = decimalResultType(left, right); + type_res = std::make_shared(result_type.getPrecision(), result_type.getScale()); } - ResultDataType result_type = decimalResultType(left, right); - type_res = std::make_shared(result_type.getPrecision(), result_type.getScale()); } else if constexpr ((IsDataTypeDecimal && IsFloatingPoint) || (IsDataTypeDecimal && IsFloatingPoint)) type_res = std::make_shared(); else if constexpr (IsDataTypeDecimal) - type_res = std::make_shared(left.getPrecision(), left.getScale()); + { + if constexpr ((is_div_int || is_div_int_or_zero) && IsIntegral) + type_res = std::make_shared(); + else if constexpr (is_div_int || is_div_int_or_zero) + { + if constexpr (std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v) + type_res = std::make_shared(); + else + type_res = std::make_shared(); + } + else + type_res = std::make_shared(left.getPrecision(), left.getScale()); + } else if constexpr (IsDataTypeDecimal) - type_res = std::make_shared(right.getPrecision(), right.getScale()); + { + if constexpr ((is_div_int || is_div_int_or_zero) && IsIntegral) + type_res = std::make_shared(); + else if constexpr (is_div_int || is_div_int_or_zero) + { + if constexpr (std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v) + type_res = std::make_shared(); + else + type_res = std::make_shared(); + } + else + type_res = std::make_shared(right.getPrecision(), right.getScale()); + } else if constexpr (std::is_same_v) { // Special case for DateTime: binary OPS should reuse timezone @@ -2009,8 +2069,10 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A constexpr bool decimal_with_float = (IsDataTypeDecimal && IsFloatingPoint) || (IsFloatingPoint && IsDataTypeDecimal); - using T0 = std::conditional_t; - using T1 = std::conditional_t; + constexpr bool is_div_int_with_decimal = (is_div_int || is_div_int_or_zero) && (IsDataTypeDecimal || IsDataTypeDecimal); + + using T0 = std::conditional_t>; + using T1 = std::conditional_t>; using ResultType = typename ResultDataType::FieldType; using ColVecT0 = ColumnVectorOrDecimal; using ColVecT1 = ColumnVectorOrDecimal; @@ -2026,6 +2088,12 @@ ColumnPtr executeStringInteger(const ColumnsWithTypeAndName & arguments, const A left_col = castColumn(arguments[0], converted_type); right_col = castColumn(arguments[1], converted_type); } + else if constexpr (is_div_int_with_decimal) + { + const auto converted_type = std::make_shared(); + left_col = castColumn(arguments[0], converted_type); + right_col = castColumn(arguments[1], converted_type); + } else { left_col = arguments[0].column; diff --git a/src/Functions/IsOperation.h b/src/Functions/IsOperation.h index 8ea53c865ce..b36530591ef 100644 --- a/src/Functions/IsOperation.h +++ 
b/src/Functions/IsOperation.h @@ -62,7 +62,9 @@ struct IsOperation static constexpr bool division = div_floating || div_int || div_int_or_zero || modulo; - static constexpr bool allow_decimal = plus || minus || multiply || division || least || greatest; + static constexpr bool division_allow_decimal = div_floating || modulo; + + static constexpr bool allow_decimal = plus || minus || multiply || division_allow_decimal || least || greatest; }; } diff --git a/tests/queries/0_stateless/02975_intdiv_with_decimal.reference b/tests/queries/0_stateless/02975_intdiv_with_decimal.reference new file mode 100644 index 00000000000..9c1faab21d7 --- /dev/null +++ b/tests/queries/0_stateless/02975_intdiv_with_decimal.reference @@ -0,0 +1,52 @@ +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 diff --git a/tests/queries/0_stateless/02975_intdiv_with_decimal.sql b/tests/queries/0_stateless/02975_intdiv_with_decimal.sql new file mode 100644 index 00000000000..8fc4b5a9a7d --- /dev/null +++ b/tests/queries/0_stateless/02975_intdiv_with_decimal.sql @@ -0,0 +1,54 @@ +--intDiv-- +SELECT intDiv(4,2); +SELECT intDiv(toDecimal32(4.4, 2), 2); +SELECT intDiv(4, toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal32(4.4, 2), 2); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 3), 2); +SELECT intDiv(toDecimal64(4.4, 3), toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 4), 2); +SELECT intDiv(toDecimal128(4.4, 4), toDecimal32(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 5), 2); +SELECT intDiv(toDecimal256(4.4, 5), toDecimal32(2.2, 2)); +SELECT intDiv(4, toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(4, toDecimal128(2.2, 3)); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal128(2.2, 3)); +SELECT intDiv(4, toDecimal256(2.2, 4)); +SELECT intDiv(toDecimal32(4.4, 2), toDecimal256(2.2, 4)); +SELECT intDiv(toDecimal64(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDiv(toDecimal64(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal128(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal256(4.4, 2), toDecimal256(2.2, 2)); +--intDivOrZero-- +SELECT intDivOrZero(4,2); +SELECT intDivOrZero(toDecimal32(4.4, 2), 2); +SELECT intDivOrZero(4, toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal32(4.4, 2), 2); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 3), 2); +SELECT intDivOrZero(toDecimal64(4.4, 3), toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 4), 2); +SELECT intDivOrZero(toDecimal128(4.4, 4), toDecimal32(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 5), 2); +SELECT intDivOrZero(toDecimal256(4.4, 5), toDecimal32(2.2, 2)); +SELECT intDivOrZero(4, toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(4, toDecimal128(2.2, 3)); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal128(2.2, 3)); +SELECT intDivOrZero(4, toDecimal256(2.2, 4)); +SELECT intDivOrZero(toDecimal32(4.4, 2), toDecimal256(2.2, 4)); +SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal64(2.2, 2)); 
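-- A minimal illustrative sketch, assuming the Decimal-to-integer result mapping added in
-- FunctionBinaryArithmetic.h above (Decimal32 -> Int32, Decimal64 -> Int64, Decimal128 -> Int128,
-- Decimal256 -> Int256, with the wider of the two operands winning):
-- SELECT toTypeName(intDiv(toDecimal32(4.4, 2), toDecimal32(2.2, 2))); -- likely Int32
-- SELECT toTypeName(intDiv(toDecimal256(4.4, 5), 2)); -- likely Int256
-- Every query in this test divides roughly 4.4 by 2.2 (or 4 by 2), which is why each line of the reference file is 2.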
+SELECT intDivOrZero(toDecimal256(4.4, 2), toDecimal64(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 2), toDecimal128(2.2, 2)); +SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal256(4.4, 2), toDecimal256(2.2, 2)); From 918614970b8bbf0938e2ee5769d9891ec6cfecbf Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 Jan 2024 02:02:03 +0100 Subject: [PATCH 146/884] Fix style --- tests/integration/test_storage_hdfs/test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py index 165dfb212b7..121263fb622 100644 --- a/tests/integration/test_storage_hdfs/test.py +++ b/tests/integration/test_storage_hdfs/test.py @@ -599,9 +599,7 @@ def test_schema_inference_with_globs(started_cluster): f"desc hdfs('hdfs://hdfs1:9000/data*.jsoncompacteachrow') settings schema_inference_use_cache_for_hdfs=0, input_format_json_infer_incomplete_types_as_strings=0" ) - assert ( - "CANNOT_EXTRACT_TABLE_STRUCTURE" in result - ) + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result def test_insert_select_schema_inference(started_cluster): From 13c86248719eabd22856e5af7161f2f7547fdd8e Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 Jan 2024 02:03:12 +0100 Subject: [PATCH 147/884] Fix style --- tests/integration/test_storage_s3/test.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tests/integration/test_storage_s3/test.py b/tests/integration/test_storage_s3/test.py index 0b5e9462860..dbbe670e8ca 100644 --- a/tests/integration/test_storage_s3/test.py +++ b/tests/integration/test_storage_s3/test.py @@ -1379,9 +1379,7 @@ def test_schema_inference_from_globs(started_cluster): f"desc s3('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/test*.jsoncompacteachrow') settings schema_inference_use_cache_for_s3=0, input_format_json_infer_incomplete_types_as_strings=0" ) - assert ( - "CANNOT_EXTRACT_TABLE_STRUCTURE" in result - ) + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result url_filename = "test{0,1,2,3}.jsoncompacteachrow" @@ -1389,9 +1387,7 @@ def test_schema_inference_from_globs(started_cluster): f"desc url('http://{started_cluster.minio_host}:{started_cluster.minio_port}/{bucket}/{url_filename}') settings schema_inference_use_cache_for_url=0, input_format_json_infer_incomplete_types_as_strings=0" ) - assert ( - "CANNOT_EXTRACT_TABLE_STRUCTURE" in result - ) + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in result def test_signatures(started_cluster): From 692d37306eda131578a3c9df77c9c2c7e46fb231 Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Fri, 26 Jan 2024 15:50:16 +0100 Subject: [PATCH 148/884] Fix build --- src/Storages/StorageURLCluster.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageURLCluster.cpp b/src/Storages/StorageURLCluster.cpp index 7b38048b384..d0df74d7521 100644 --- a/src/Storages/StorageURLCluster.cpp +++ b/src/Storages/StorageURLCluster.cpp @@ -44,7 +44,7 @@ StorageURLCluster::StorageURLCluster( const ColumnsDescription & columns_, const ConstraintsDescription & constraints_, const StorageURL::Configuration & configuration_) - : 
IStorageCluster(cluster_name_, table_id_, getLogger("StorageURLCluster (" + table_id_.table_name + ")"), structure_argument_was_provided_) + : IStorageCluster(cluster_name_, table_id_, getLogger("StorageURLCluster (" + table_id_.table_name + ")")) , uri(uri_), format_name(format_) { context->getRemoteHostFilter().checkURL(Poco::URI(uri)); From bade45d197884812886fd6eedd85883f5d67fa50 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Sat, 27 Jan 2024 21:30:49 +0100 Subject: [PATCH 149/884] impl --- contrib/aws | 2 +- contrib/aws-c-auth | 2 +- contrib/aws-c-cal | 2 +- contrib/aws-c-common | 2 +- contrib/aws-c-compression | 2 +- contrib/aws-c-event-stream | 2 +- contrib/aws-c-http | 2 +- contrib/aws-c-io | 2 +- contrib/aws-c-mqtt | 2 +- contrib/aws-c-s3 | 2 +- contrib/aws-c-sdkutils | 2 +- contrib/aws-checksums | 2 +- contrib/aws-cmake/CMakeLists.txt | 11 ++++++++++- contrib/aws-crt-cpp | 2 +- contrib/aws-s2n-tls | 2 +- 15 files changed, 24 insertions(+), 15 deletions(-) diff --git a/contrib/aws b/contrib/aws index ca02358dcc7..4ec215f3607 160000 --- a/contrib/aws +++ b/contrib/aws @@ -1 +1 @@ -Subproject commit ca02358dcc7ce3ab733dd4cbcc32734eecfa4ee3 +Subproject commit 4ec215f3607c2111bf2cc91ba842046a6b5eb0c4 diff --git a/contrib/aws-c-auth b/contrib/aws-c-auth index 97133a2b5db..baeffa791d9 160000 --- a/contrib/aws-c-auth +++ b/contrib/aws-c-auth @@ -1 +1 @@ -Subproject commit 97133a2b5dbca1ccdf88cd6f44f39d0531d27d12 +Subproject commit baeffa791d9d1cf61460662a6d9ac2186aaf05df diff --git a/contrib/aws-c-cal b/contrib/aws-c-cal index 85dd7664b78..9453687ff54 160000 --- a/contrib/aws-c-cal +++ b/contrib/aws-c-cal @@ -1 +1 @@ -Subproject commit 85dd7664b786a389c6fb1a6f031ab4bb2282133d +Subproject commit 9453687ff5493ba94eaccf8851200565c4364c77 diff --git a/contrib/aws-c-common b/contrib/aws-c-common index 45dcb2849c8..80f21b3cac5 160000 --- a/contrib/aws-c-common +++ b/contrib/aws-c-common @@ -1 +1 @@ -Subproject commit 45dcb2849c891dba2100b270b4676765c92949ff +Subproject commit 80f21b3cac5ac51c6b8a62c7d2a5ef58a75195ee diff --git a/contrib/aws-c-compression b/contrib/aws-c-compression index b517b7decd0..99ec79ee297 160000 --- a/contrib/aws-c-compression +++ b/contrib/aws-c-compression @@ -1 +1 @@ -Subproject commit b517b7decd0dac30be2162f5186c250221c53aff +Subproject commit 99ec79ee2970f1a045d4ced1501b97ee521f2f85 diff --git a/contrib/aws-c-event-stream b/contrib/aws-c-event-stream index 2f9b60c42f9..08f24e384e5 160000 --- a/contrib/aws-c-event-stream +++ b/contrib/aws-c-event-stream @@ -1 +1 @@ -Subproject commit 2f9b60c42f90840ec11822acda3d8cdfa97a773d +Subproject commit 08f24e384e5be20bcffa42b49213d24dad7881ae diff --git a/contrib/aws-c-http b/contrib/aws-c-http index dd344619879..a082f8a2067 160000 --- a/contrib/aws-c-http +++ b/contrib/aws-c-http @@ -1 +1 @@ -Subproject commit dd34461987947672444d0bc872c5a733dfdb9711 +Subproject commit a082f8a2067e4a31db73f1d4ffd702a8dc0f7089 diff --git a/contrib/aws-c-io b/contrib/aws-c-io index d58ed4f272b..11ce3c750a1 160000 --- a/contrib/aws-c-io +++ b/contrib/aws-c-io @@ -1 +1 @@ -Subproject commit d58ed4f272b1cb4f89ac9196526ceebe5f2b0d89 +Subproject commit 11ce3c750a1dac7b04069fc5bff89e97e91bad4d diff --git a/contrib/aws-c-mqtt b/contrib/aws-c-mqtt index 33c3455cec8..6d36cd37262 160000 --- a/contrib/aws-c-mqtt +++ b/contrib/aws-c-mqtt @@ -1 +1 @@ -Subproject commit 33c3455cec82b16feb940e12006cefd7b3ef4194 +Subproject commit 6d36cd3726233cb757468d0ea26f6cd8dad151ec diff --git a/contrib/aws-c-s3 b/contrib/aws-c-s3 index d7bfe602d69..de36fee8fe7 
160000 --- a/contrib/aws-c-s3 +++ b/contrib/aws-c-s3 @@ -1 +1 @@ -Subproject commit d7bfe602d6925948f1fff95784e3613cca6a3900 +Subproject commit de36fee8fe7ab02f10987877ae94a805bf440c1f diff --git a/contrib/aws-c-sdkutils b/contrib/aws-c-sdkutils index 208a701fa01..fd8c0ba2e23 160000 --- a/contrib/aws-c-sdkutils +++ b/contrib/aws-c-sdkutils @@ -1 +1 @@ -Subproject commit 208a701fa01e99c7c8cc3dcebc8317da71362972 +Subproject commit fd8c0ba2e233997eaaefe82fb818b8b444b956d3 diff --git a/contrib/aws-checksums b/contrib/aws-checksums index ad53be196a2..321b805559c 160000 --- a/contrib/aws-checksums +++ b/contrib/aws-checksums @@ -1 +1 @@ -Subproject commit ad53be196a25bbefa3700a01187fdce573a7d2d0 +Subproject commit 321b805559c8e911be5bddba13fcbd222a3e2d3a diff --git a/contrib/aws-cmake/CMakeLists.txt b/contrib/aws-cmake/CMakeLists.txt index 950a0e06cd0..b913908911c 100644 --- a/contrib/aws-cmake/CMakeLists.txt +++ b/contrib/aws-cmake/CMakeLists.txt @@ -35,6 +35,8 @@ set(AWS_PUBLIC_COMPILE_DEFS) set(AWS_PRIVATE_COMPILE_DEFS) set(AWS_PRIVATE_LIBS) +list(APPEND AWS_PRIVATE_COMPILE_DEFS "-DINTEL_NO_ITTNOTIFY_API") + if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG") list(APPEND AWS_PRIVATE_COMPILE_DEFS "-DDEBUG_BUILD") endif() @@ -85,14 +87,20 @@ file(GLOB AWS_SDK_CORE_SRC "${AWS_SDK_CORE_DIR}/source/external/cjson/*.cpp" "${AWS_SDK_CORE_DIR}/source/external/tinyxml2/*.cpp" "${AWS_SDK_CORE_DIR}/source/http/*.cpp" + "${AWS_SDK_CORE_DIR}/source/http/crt/*.cpp" "${AWS_SDK_CORE_DIR}/source/http/standard/*.cpp" "${AWS_SDK_CORE_DIR}/source/internal/*.cpp" "${AWS_SDK_CORE_DIR}/source/monitoring/*.cpp" + "${AWS_SDK_CORE_DIR}/source/net/*.cpp" + "${AWS_SDK_CORE_DIR}/source/net/linux-shared/*.cpp" + "${AWS_SDK_CORE_DIR}/source/platform/linux-shared/*.cpp" + "${AWS_SDK_CORE_DIR}/source/smithy/tracing/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/base64/*.cpp" + "${AWS_SDK_CORE_DIR}/source/utils/component-registry/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/crypto/*.cpp" - "${AWS_SDK_CORE_DIR}/source/utils/crypto/openssl/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/crypto/factory/*.cpp" + "${AWS_SDK_CORE_DIR}/source/utils/crypto/openssl/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/event/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/json/*.cpp" "${AWS_SDK_CORE_DIR}/source/utils/logging/*.cpp" @@ -176,6 +184,7 @@ file(GLOB AWS_COMMON_SRC "${AWS_COMMON_DIR}/source/*.c" "${AWS_COMMON_DIR}/source/external/*.c" "${AWS_COMMON_DIR}/source/posix/*.c" + "${AWS_COMMON_DIR}/source/linux/*.c" ) file(GLOB AWS_COMMON_ARCH_SRC diff --git a/contrib/aws-crt-cpp b/contrib/aws-crt-cpp index 8a301b7e842..86adce22528 160000 --- a/contrib/aws-crt-cpp +++ b/contrib/aws-crt-cpp @@ -1 +1 @@ -Subproject commit 8a301b7e842f1daed478090c869207300972379f +Subproject commit 86adce22528b811efa5ca27f65d8d5a38223cbfa diff --git a/contrib/aws-s2n-tls b/contrib/aws-s2n-tls index 71f4794b758..9a1e7545402 160000 --- a/contrib/aws-s2n-tls +++ b/contrib/aws-s2n-tls @@ -1 +1 @@ -Subproject commit 71f4794b7580cf780eb4aca77d69eded5d3c7bb4 +Subproject commit 9a1e75454023e952b366ce1eab9c54007250119f From 37823a7b91a7bafc05289ec9e4ed1f4448146219 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Sat, 27 Jan 2024 23:45:14 +0100 Subject: [PATCH 150/884] use upstream repo --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 68016bf8c5b..a618104f364 100644 --- a/.gitmodules +++ b/.gitmodules @@ -99,7 +99,7 @@ url = https://github.com/awslabs/aws-c-event-stream [submodule "aws-c-common"] 
path = contrib/aws-c-common - url = https://github.com/ClickHouse/aws-c-common + url = https://github.com/awslabs/aws-c-common.git [submodule "aws-checksums"] path = contrib/aws-checksums url = https://github.com/awslabs/aws-checksums From d264a5a148c577ab046dc4bbef50b5a4e0c32db9 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Sun, 28 Jan 2024 12:06:52 +0100 Subject: [PATCH 151/884] Update client & settings to std::shared_ptr --- src/Backups/BackupIO_AzureBlobStorage.cpp | 6 ++-- src/Backups/BackupIO_AzureBlobStorage.h | 8 +++--- .../AzureBlobStorage/AzureObjectStorage.h | 6 ++-- .../Cached/CachedObjectStorage.h | 2 +- src/Disks/ObjectStorages/IObjectStorage.h | 2 +- .../copyAzureBlobStorageFile.cpp | 28 +++++++++---------- .../copyAzureBlobStorageFile.h | 10 +++---- 7 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/Backups/BackupIO_AzureBlobStorage.cpp b/src/Backups/BackupIO_AzureBlobStorage.cpp index 1b4c10ad0cb..d99f296cca1 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.cpp +++ b/src/Backups/BackupIO_AzureBlobStorage.cpp @@ -89,7 +89,7 @@ std::unique_ptr BackupReaderAzureBlobStorage::readFile(const key = file_name; } return std::make_unique( - client.get(), key, read_settings, settings.get()->max_single_read_retries, + client, key, read_settings, settings.get()->max_single_read_retries, settings.get()->max_single_download_retries); } @@ -262,7 +262,7 @@ std::unique_ptr BackupWriterAzureBlobStorage::readFile(const String } return std::make_unique( - client.get(), key, read_settings, settings.get()->max_single_read_retries, + client, key, read_settings, settings.get()->max_single_read_retries, settings.get()->max_single_download_retries); } @@ -278,7 +278,7 @@ std::unique_ptr BackupWriterAzureBlobStorage::writeFile(const Strin key = file_name; } return std::make_unique( - client.get(), + client, key, settings.get()->max_single_part_upload_size, DBMS_DEFAULT_BUFFER_SIZE, diff --git a/src/Backups/BackupIO_AzureBlobStorage.h b/src/Backups/BackupIO_AzureBlobStorage.h index 87dc470cdb3..95325044a62 100644 --- a/src/Backups/BackupIO_AzureBlobStorage.h +++ b/src/Backups/BackupIO_AzureBlobStorage.h @@ -28,10 +28,10 @@ public: private: const DataSourceDescription data_source_description; - MultiVersion client; + std::shared_ptr client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; - MultiVersion settings; + std::shared_ptr settings; }; class BackupWriterAzureBlobStorage : public BackupWriterDefault @@ -57,10 +57,10 @@ private: std::unique_ptr readFile(const String & file_name, size_t expected_file_size) override; void removeFilesBatch(const Strings & file_names); const DataSourceDescription data_source_description; - MultiVersion client; + std::shared_ptr client; StorageAzureBlob::Configuration configuration; std::unique_ptr object_storage; - MultiVersion settings; + std::shared_ptr settings; }; } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 30fedb601dc..0ae12fb205f 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -137,11 +137,11 @@ public: bool isRemote() const override { return true; } - MultiVersion & getSettings() { return settings; } + std::shared_ptr getSettings() { return settings.get(); } - MultiVersion & getAzureBlobStorageClient() override + std::shared_ptr getAzureBlobStorageClient() override { - return client; + return 
client.get(); } private: diff --git a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h index 2ed8990515f..1f293e5857e 100644 --- a/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h +++ b/src/Disks/ObjectStorages/Cached/CachedObjectStorage.h @@ -122,7 +122,7 @@ public: static bool canUseReadThroughCache(const ReadSettings & settings); #if USE_AZURE_BLOB_STORAGE - MultiVersion & getAzureBlobStorageClient() override + std::shared_ptr getAzureBlobStorageClient() override { return object_storage->getAzureBlobStorageClient(); } diff --git a/src/Disks/ObjectStorages/IObjectStorage.h b/src/Disks/ObjectStorages/IObjectStorage.h index e066beaefcc..049935ad60c 100644 --- a/src/Disks/ObjectStorages/IObjectStorage.h +++ b/src/Disks/ObjectStorages/IObjectStorage.h @@ -226,7 +226,7 @@ public: virtual WriteSettings patchSettings(const WriteSettings & write_settings) const; #if USE_AZURE_BLOB_STORAGE - virtual MultiVersion & getAzureBlobStorageClient() + virtual std::shared_ptr getAzureBlobStorageClient() { throw Exception(ErrorCodes::NOT_IMPLEMENTED, "This function is only implemented for AzureBlobStorage"); } diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index e5517a1a021..537a5a191e7 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -44,12 +44,12 @@ namespace public: UploadHelper( const CreateReadBuffer & create_read_buffer_, - MultiVersion & client_, + std::shared_ptr client_, size_t offset_, size_t total_size_, const String & dest_container_for_logging_, const String & dest_blob_, - MultiVersion settings_, + std::shared_ptr settings_, ThreadPoolCallbackRunner schedule_, bool for_disk_azure_blob_storage_, const Poco::Logger * log_) @@ -71,12 +71,12 @@ namespace protected: std::function()> create_read_buffer; - MultiVersion & client; + std::shared_ptr client; size_t offset; size_t total_size; const String & dest_container_for_logging; const String & dest_blob; - MultiVersion settings; + std::shared_ptr settings; ThreadPoolCallbackRunner schedule; bool for_disk_azure_blob_storage; const Poco::Logger * log; @@ -116,7 +116,7 @@ namespace void completeMultipartUpload() { - auto block_blob_client = client.get()->GetBlockBlobClient(dest_blob); + auto block_blob_client = client->GetBlockBlobClient(dest_blob); block_blob_client.CommitBlockList(block_ids); } @@ -222,7 +222,7 @@ namespace if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureUploadPart); - auto block_blob_client = client.get()->GetBlockBlobClient(dest_blob); + auto block_blob_client = client->GetBlockBlobClient(dest_blob); while (!task.read_buffer->eof()) { @@ -267,10 +267,10 @@ void copyDataToAzureBlobStorageFile( const std::function()> & create_read_buffer, size_t offset, size_t size, - MultiVersion & dest_client, + std::shared_ptr dest_client, const String & dest_container_for_logging, const String & dest_blob, - MultiVersion settings, + std::shared_ptr settings, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) { @@ -280,15 +280,15 @@ void copyDataToAzureBlobStorageFile( void copyAzureBlobStorageFile( - MultiVersion & src_client, - MultiVersion & dest_client, + std::shared_ptr src_client, + std::shared_ptr dest_client, const String & src_container_for_logging, const String & src_blob, size_t offset, size_t size, const String & dest_container_for_logging, const String & dest_blob, - 
MultiVersion settings, + std::shared_ptr settings, const ReadSettings & read_settings, ThreadPoolCallbackRunner schedule, bool for_disk_azure_blob_storage) @@ -300,8 +300,8 @@ void copyAzureBlobStorageFile( if (for_disk_azure_blob_storage) ProfileEvents::increment(ProfileEvents::DiskAzureCopyObject); - auto block_blob_client_src = src_client.get()->GetBlockBlobClient(src_blob); - auto block_blob_client_dest = dest_client.get()->GetBlockBlobClient(dest_blob); + auto block_blob_client_src = src_client->GetBlockBlobClient(src_blob); + auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_blob); auto source_uri = block_blob_client_src.GetUrl(); if (size < max_single_operation_copy_size) @@ -328,7 +328,7 @@ void copyAzureBlobStorageFile( LOG_TRACE(&Poco::Logger::get("copyAzureBlobStorageFile"), "Reading from Container: {}, Blob: {}", src_container_for_logging, src_blob); auto create_read_buffer = [&] { - return std::make_unique(src_client.get(), src_blob, read_settings, settings.get()->max_single_read_retries, + return std::make_unique(src_client, src_blob, read_settings, settings.get()->max_single_read_retries, settings.get()->max_single_download_retries); }; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h index 15a31031f63..83814f42693 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.h @@ -21,15 +21,15 @@ using CreateReadBuffer = std::function()>; /// Copies a file from AzureBlobStorage to AzureBlobStorage. /// The parameters `src_offset` and `src_size` specify a part in the source to copy. void copyAzureBlobStorageFile( - MultiVersion & src_client, - MultiVersion & dest_client, + std::shared_ptr src_client, + std::shared_ptr dest_client, const String & src_container_for_logging, const String & src_blob, size_t src_offset, size_t src_size, const String & dest_container_for_logging, const String & dest_blob, - MultiVersion settings, + std::shared_ptr settings, const ReadSettings & read_settings, ThreadPoolCallbackRunner schedule_ = {}, bool for_disk_azure_blob_storage = false); @@ -44,10 +44,10 @@ void copyDataToAzureBlobStorageFile( const std::function()> & create_read_buffer, size_t offset, size_t size, - MultiVersion & client, + std::shared_ptr client, const String & dest_container_for_logging, const String & dest_blob, - MultiVersion settings, + std::shared_ptr settings, ThreadPoolCallbackRunner schedule_ = {}, bool for_disk_azure_blob_storage = false); From b54be00783c38786370bce20930e626adc8fb3a1 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Sun, 28 Jan 2024 14:36:22 +0100 Subject: [PATCH 152/884] fix build --- contrib/aws-cmake/CMakeLists.txt | 6 +++--- contrib/aws-crt-cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/aws-cmake/CMakeLists.txt b/contrib/aws-cmake/CMakeLists.txt index b913908911c..abde20addaf 100644 --- a/contrib/aws-cmake/CMakeLists.txt +++ b/contrib/aws-cmake/CMakeLists.txt @@ -25,6 +25,7 @@ include("${ClickHouse_SOURCE_DIR}/contrib/aws-cmake/AwsFeatureTests.cmake") include("${ClickHouse_SOURCE_DIR}/contrib/aws-cmake/AwsThreadAffinity.cmake") include("${ClickHouse_SOURCE_DIR}/contrib/aws-cmake/AwsThreadName.cmake") include("${ClickHouse_SOURCE_DIR}/contrib/aws-cmake/AwsSIMD.cmake") +include("${ClickHouse_SOURCE_DIR}/contrib/aws-crt-cpp/cmake/AwsGetVersion.cmake") # Gather sources and options. 
@@ -123,9 +124,8 @@ OPTION(USE_AWS_MEMORY_MANAGEMENT "Aws memory management" OFF) configure_file("${AWS_SDK_CORE_DIR}/include/aws/core/SDKConfig.h.in" "${CMAKE_CURRENT_BINARY_DIR}/include/aws/core/SDKConfig.h" @ONLY) -list(APPEND AWS_PUBLIC_COMPILE_DEFS "-DAWS_SDK_VERSION_MAJOR=1") -list(APPEND AWS_PUBLIC_COMPILE_DEFS "-DAWS_SDK_VERSION_MINOR=10") -list(APPEND AWS_PUBLIC_COMPILE_DEFS "-DAWS_SDK_VERSION_PATCH=36") +aws_get_version(AWS_CRT_CPP_VERSION_MAJOR AWS_CRT_CPP_VERSION_MINOR AWS_CRT_CPP_VERSION_PATCH FULL_VERSION GIT_HASH) +configure_file("${AWS_CRT_DIR}/include/aws/crt/Config.h.in" "${AWS_CRT_DIR}/include/aws/crt/Config.h" @ONLY) list(APPEND AWS_SOURCES ${AWS_SDK_CORE_SRC} ${AWS_SDK_CORE_NET_SRC} ${AWS_SDK_CORE_PLATFORM_SRC}) diff --git a/contrib/aws-crt-cpp b/contrib/aws-crt-cpp index 86adce22528..f532d6abc0d 160000 --- a/contrib/aws-crt-cpp +++ b/contrib/aws-crt-cpp @@ -1 +1 @@ -Subproject commit 86adce22528b811efa5ca27f65d8d5a38223cbfa +Subproject commit f532d6abc0d2b0d8b5d6fe9e7c51eaedbe4afbd0 From 368a26a2aac14da37914b5a25f9537008e48d349 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Sun, 28 Jan 2024 15:48:49 +0100 Subject: [PATCH 153/884] fix build --- contrib/update-submodules.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/update-submodules.sh b/contrib/update-submodules.sh index 7195de020bd..072d7a5dc2f 100755 --- a/contrib/update-submodules.sh +++ b/contrib/update-submodules.sh @@ -24,7 +24,7 @@ git config --file .gitmodules --get-regexp '.*path' | sed 's/[^ ]* //' | xargs - # We don't want to depend on any third-party CMake files. # To check it, find and delete them. grep -o -P '"contrib/[^"]+"' .gitmodules | - grep -v -P 'contrib/(llvm-project|google-protobuf|grpc|abseil-cpp|corrosion)' | + grep -v -P 'contrib/(llvm-project|google-protobuf|grpc|abseil-cpp|corrosion|aws-crt-cpp)' | xargs -I@ find @ \ -'(' -name 'CMakeLists.txt' -or -name '*.cmake' -')' -and -not -name '*.h.cmake' \ -delete From ba85642453915dd57c0cba256b35bf8bec390ea5 Mon Sep 17 00:00:00 2001 From: serxa Date: Sun, 28 Jan 2024 20:26:55 +0000 Subject: [PATCH 154/884] split ISlotControl from ConcurrencyControl --- programs/server/Server.cpp | 2 +- src/Common/ConcurrencyControl.cpp | 28 ++++--- src/Common/ConcurrencyControl.h | 36 ++++----- src/Common/ISlotControl.h | 76 +++++++++++++++++++ .../tests/gtest_concurrency_control.cpp | 28 +++---- src/Processors/Executors/PipelineExecutor.cpp | 14 ++-- src/Processors/Executors/PipelineExecutor.h | 4 +- 7 files changed, 132 insertions(+), 56 deletions(-) create mode 100644 src/Common/ISlotControl.h diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 75ec574c357..d6bee995ca4 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1366,7 +1366,7 @@ try global_context->setMaxDatabaseNumToWarn(new_server_settings.max_database_num_to_warn); global_context->setMaxPartNumToWarn(new_server_settings.max_part_num_to_warn); - ConcurrencyControl::SlotCount concurrent_threads_soft_limit = ConcurrencyControl::Unlimited; + SlotCount concurrent_threads_soft_limit = UnlimitedSlots; if (new_server_settings.concurrent_threads_soft_limit_num > 0 && new_server_settings.concurrent_threads_soft_limit_num < concurrent_threads_soft_limit) concurrent_threads_soft_limit = new_server_settings.concurrent_threads_soft_limit_num; if (new_server_settings.concurrent_threads_soft_limit_ratio_to_cores > 0) diff --git a/src/Common/ConcurrencyControl.cpp b/src/Common/ConcurrencyControl.cpp index c9fe51550dc..0893cfce955 
100644 --- a/src/Common/ConcurrencyControl.cpp +++ b/src/Common/ConcurrencyControl.cpp @@ -12,10 +12,10 @@ namespace ErrorCodes ConcurrencyControl::Slot::~Slot() { - allocation->release(); + static_cast(*allocation).release(); } -ConcurrencyControl::Slot::Slot(AllocationPtr && allocation_) +ConcurrencyControl::Slot::Slot(SlotAllocationPtr && allocation_) : allocation(std::move(allocation_)) { } @@ -27,7 +27,7 @@ ConcurrencyControl::Allocation::~Allocation() parent.free(this); } -[[nodiscard]] ConcurrencyControl::SlotPtr ConcurrencyControl::Allocation::tryAcquire() +[[nodiscard]] AcquiredSlotPtr ConcurrencyControl::Allocation::tryAcquire() { SlotCount value = granted.load(); while (value) @@ -35,15 +35,21 @@ ConcurrencyControl::Allocation::~Allocation() if (granted.compare_exchange_strong(value, value - 1)) { std::unique_lock lock{mutex}; - return SlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor + return AcquiredSlotPtr(new Slot(shared_from_this())); // can't use std::make_shared due to private ctor } } return {}; // avoid unnecessary locking } -ConcurrencyControl::SlotCount ConcurrencyControl::Allocation::grantedCount() const +SlotCount ConcurrencyControl::Allocation::grantedCount() const { - return granted; + return granted.load(); +} + +SlotCount ConcurrencyControl::Allocation::allocatedCount() const +{ + std::unique_lock lock{mutex}; + return allocated; } ConcurrencyControl::Allocation::Allocation(ConcurrencyControl & parent_, SlotCount limit_, SlotCount granted_, Waiters::iterator waiter_) @@ -87,7 +93,7 @@ ConcurrencyControl::~ConcurrencyControl() abort(); } -[[nodiscard]] ConcurrencyControl::AllocationPtr ConcurrencyControl::allocate(SlotCount min, SlotCount max) +[[nodiscard]] SlotAllocationPtr ConcurrencyControl::allocate(SlotCount min, SlotCount max) { if (min > max) throw Exception(ErrorCodes::LOGICAL_ERROR, "ConcurrencyControl: invalid allocation requirements"); @@ -100,13 +106,13 @@ ConcurrencyControl::~ConcurrencyControl() // Create allocation and start waiting if more slots are required if (granted < max) - return AllocationPtr(new Allocation(*this, max, granted, + return SlotAllocationPtr(new Allocation(*this, max, granted, waiters.insert(cur_waiter, nullptr /* pointer is set by Allocation ctor */))); else - return AllocationPtr(new Allocation(*this, max, granted)); + return SlotAllocationPtr(new Allocation(*this, max, granted)); } -void ConcurrencyControl::setMaxConcurrency(ConcurrencyControl::SlotCount value) +void ConcurrencyControl::setMaxConcurrency(SlotCount value) { std::unique_lock lock{mutex}; max_concurrency = std::max(1, value); // never allow max_concurrency to be zero @@ -162,7 +168,7 @@ void ConcurrencyControl::schedule(std::unique_lock &) } } -ConcurrencyControl::SlotCount ConcurrencyControl::available(std::unique_lock &) const +SlotCount ConcurrencyControl::available(std::unique_lock &) const { if (cur_concurrency < max_concurrency) return max_concurrency - cur_concurrency; diff --git a/src/Common/ConcurrencyControl.h b/src/Common/ConcurrencyControl.h index 7e20384aa2a..ba94502962c 100644 --- a/src/Common/ConcurrencyControl.h +++ b/src/Common/ConcurrencyControl.h @@ -7,6 +7,7 @@ #include #include +#include namespace DB { @@ -34,41 +35,35 @@ namespace DB * Oversubscription is possible: total amount of allocated slots can exceed `setMaxConcurrency(limit)` * because `min` amount of slots is allocated for each query unconditionally. 
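 * A worked example (illustrative; the numbers simply follow the allocate() logic shown in this diff):
 * with setMaxConcurrency(10) and three queries each calling allocate(min = 1, max = 6), the grants are
 * 6, then max(1, min(6, 4)) = 4, then max(1, min(6, 0)) = 1, for 11 slots in total, which demonstrates
 * oversubscription; the last two allocations stay in the waiters list and are granted more slots
 * round-robin as the first query releases its acquired slots.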
*/ -class ConcurrencyControl : boost::noncopyable +class ConcurrencyControl : public ISlotControl { public: struct Allocation; - using AllocationPtr = std::shared_ptr; - using SlotCount = UInt64; using Waiters = std::list; - static constexpr SlotCount Unlimited = std::numeric_limits::max(); - // Scoped guard for acquired slot, see Allocation::tryAcquire() - struct Slot : boost::noncopyable + struct Slot : public IAcquiredSlot { - ~Slot(); + ~Slot() override; private: friend struct Allocation; // for ctor - explicit Slot(AllocationPtr && allocation_); + explicit Slot(SlotAllocationPtr && allocation_); - AllocationPtr allocation; + SlotAllocationPtr allocation; }; - // FIXME: have to be unique_ptr, but ThreadFromGlobalPool does not support move semantics yet - using SlotPtr = std::shared_ptr; - // Manages group of slots for a single query, see ConcurrencyControl::allocate(min, max) - struct Allocation : std::enable_shared_from_this, boost::noncopyable + struct Allocation : public ISlotAllocation { - ~Allocation(); + ~Allocation() override; // Take one already granted slot if available. Lock-free iff there is no granted slot. - [[nodiscard]] SlotPtr tryAcquire(); + [[nodiscard]] AcquiredSlotPtr tryAcquire() override; - SlotCount grantedCount() const; + SlotCount grantedCount() const override; + SlotCount allocatedCount() const override; private: friend struct Slot; // for release() @@ -94,7 +89,7 @@ public: ConcurrencyControl & parent; const SlotCount limit; - std::mutex mutex; // the following values must be accessed under this mutex + mutable std::mutex mutex; // the following values must be accessed under this mutex SlotCount allocated; // allocated total (including already `released`) SlotCount released = 0; @@ -103,17 +98,16 @@ public: const Waiters::iterator waiter; // iterator to itself in Waiters list; valid iff allocated < limit }; -public: ConcurrencyControl(); // WARNING: all Allocation objects MUST be destructed before ConcurrencyControl // NOTE: Recommended way to achieve this is to use `instance()` and do graceful shutdown of queries - ~ConcurrencyControl(); + ~ConcurrencyControl() override; // Allocate at least `min` and at most `max` slots. // If not all `max` slots were successfully allocated, a subscription for later allocation is created // Use `Allocation::tryAcquire()` to acquire allocated slot, before running a thread. - [[nodiscard]] AllocationPtr allocate(SlotCount min, SlotCount max); + [[nodiscard]] SlotAllocationPtr allocate(SlotCount min, SlotCount max) override; void setMaxConcurrency(SlotCount value); @@ -134,7 +128,7 @@ private: std::mutex mutex; Waiters waiters; Waiters::iterator cur_waiter; // round-robin pointer - SlotCount max_concurrency = Unlimited; + SlotCount max_concurrency = UnlimitedSlots; SlotCount cur_concurrency = 0; }; diff --git a/src/Common/ISlotControl.h b/src/Common/ISlotControl.h new file mode 100644 index 00000000000..add19f0cc0c --- /dev/null +++ b/src/Common/ISlotControl.h @@ -0,0 +1,76 @@ +#pragma once + +#include +#include +#include +#include + + +namespace DB +{ + +// Interfaces for abstract "slot" allocation and control. +// Slot is a virtual entity existing in a limited amount (CPUs or memory chunks, etc). +// +// Every slot can be in one of the following states: +// * free: slot is available to be allocated. +// * allocated: slot is allocated to a specific ISlotAllocation. +// +// Allocated slots can be considered as: +// * granted: allocated, but not yet acquired. +// * acquired: acquired using IAcquiredSlot. 
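// A rough usage sketch (illustrative only; it mirrors how ConcurrencyControl and PipelineExecutor in this
// patch use these interfaces, and is not an exact API contract; num_threads is a placeholder):
//
//     SlotAllocationPtr slots = ConcurrencyControl::instance().allocate(/*min*/ 1, /*max*/ num_threads);
//     std::vector<AcquiredSlotPtr> acquired;
//     while (AcquiredSlotPtr slot = slots->tryAcquire())   // returns an empty pointer when nothing is granted
//         acquired.push_back(std::move(slot));             // hold the slot for the lifetime of a worker thread
//     // destroying an AcquiredSlotPtr releases the slot; waiting allocations may then be granted more slots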
+// +// Example for CPU (see ConcurrencyControl.h). Every slot represents one CPU in the system. +// Slot allocation is a request to allocate specific number of CPUs for a specific query. +// Acquired slot is an entity that is held by a thread as long as it is running. This allows +// total number of threads in the system to be limited and the distribution process to be controlled. +// +// TODO: +// - for preemption - ability to return granted slot back and reacquire it later. +// - for memory allocations - variable size of slots (in bytes). + +/// Number of slots +using SlotCount = UInt64; + +/// Unlimited number of slots +constexpr SlotCount UnlimitedSlots = std::numeric_limits::max(); + +/// Acquired slot holder. Slot is considered to be acquired as long the object exists. +class IAcquiredSlot : public std::enable_shared_from_this, boost::noncopyable +{ +public: + virtual ~IAcquiredSlot() = default; +}; + +using AcquiredSlotPtr = std::shared_ptr; + +/// Request for allocation of slots from ISlotControl. +/// Allows for more slots to be acquired and the whole request to be canceled. +class ISlotAllocation : public std::enable_shared_from_this, boost::noncopyable +{ +public: + virtual ~ISlotAllocation() = default; + + /// Take one already granted slot if available. + [[nodiscard]] virtual AcquiredSlotPtr tryAcquire() = 0; + + /// Returns the number of granted slots for given allocation (i.e. available to be acquired) + virtual SlotCount grantedCount() const = 0; + + /// Returns the total number of slots allocated at the moment (acquired and granted) + virtual SlotCount allocatedCount() const = 0; +}; + +using SlotAllocationPtr = std::shared_ptr; + +class ISlotControl : boost::noncopyable +{ +public: + virtual ~ISlotControl() = default; + + // Allocate at least `min` and at most `max` slots. 
+ // If not all `max` slots were successfully allocated, a "subscription" for later allocation is created + [[nodiscard]] virtual SlotAllocationPtr allocate(SlotCount min, SlotCount max) = 0; +}; + +} diff --git a/src/Common/tests/gtest_concurrency_control.cpp b/src/Common/tests/gtest_concurrency_control.cpp index 8e5b89a72a0..5e579317ade 100644 --- a/src/Common/tests/gtest_concurrency_control.cpp +++ b/src/Common/tests/gtest_concurrency_control.cpp @@ -15,7 +15,7 @@ struct ConcurrencyControlTest { ConcurrencyControl cc; - explicit ConcurrencyControlTest(ConcurrencyControl::SlotCount limit = ConcurrencyControl::Unlimited) + explicit ConcurrencyControlTest(SlotCount limit = UnlimitedSlots) { cc.setMaxConcurrency(limit); } @@ -25,7 +25,7 @@ TEST(ConcurrencyControl, Unlimited) { ConcurrencyControlTest t; // unlimited number of slots auto slots = t.cc.allocate(0, 100500); - std::vector acquired; + std::vector acquired; while (auto slot = slots->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 100500); @@ -34,14 +34,14 @@ TEST(ConcurrencyControl, Unlimited) TEST(ConcurrencyControl, Fifo) { ConcurrencyControlTest t(1); // use single slot - std::vector allocations; + std::vector allocations; constexpr int count = 42; allocations.reserve(count); for (int i = 0; i < count; i++) allocations.emplace_back(t.cc.allocate(0, 1)); for (int i = 0; i < count; i++) { - ConcurrencyControl::SlotPtr holder; + AcquiredSlotPtr holder; for (int j = 0; j < count; j++) { auto slot = allocations[j]->tryAcquire(); @@ -60,11 +60,11 @@ TEST(ConcurrencyControl, Fifo) TEST(ConcurrencyControl, Oversubscription) { ConcurrencyControlTest t(10); - std::vector allocations; + std::vector allocations; allocations.reserve(10); for (int i = 0; i < 10; i++) allocations.emplace_back(t.cc.allocate(1, 2)); - std::vector slots; + std::vector slots; // Normal allocation using maximum amount of slots for (int i = 0; i < 5; i++) { @@ -90,7 +90,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots) { ConcurrencyControlTest t(10); { - std::vector allocations; + std::vector allocations; allocations.reserve(10); for (int i = 0; i < 10; i++) allocations.emplace_back(t.cc.allocate(1, 2)); @@ -98,7 +98,7 @@ TEST(ConcurrencyControl, ReleaseUnacquiredSlots) } // Check that slots were actually released auto allocation = t.cc.allocate(0, 20); - std::vector acquired; + std::vector acquired; while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 10); @@ -110,7 +110,7 @@ TEST(ConcurrencyControl, DestroyNotFullyAllocatedAllocation) for (int i = 0; i < 3; i++) { auto allocation = t.cc.allocate(5, 20); - std::vector acquired; + std::vector acquired; while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 10); @@ -122,7 +122,7 @@ TEST(ConcurrencyControl, DestroyAllocationBeforeSlots) ConcurrencyControlTest t(10); for (int i = 0; i < 3; i++) { - std::vector acquired; + std::vector acquired; auto allocation = t.cc.allocate(5, 20); while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); @@ -135,7 +135,7 @@ TEST(ConcurrencyControl, GrantReleasedToTheSameAllocation) { ConcurrencyControlTest t(3); auto allocation = t.cc.allocate(0, 10); - std::list acquired; + std::list acquired; while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 3); // 0 1 2 @@ -183,7 +183,7 @@ TEST(ConcurrencyControl, SetSlotCount) { 
ConcurrencyControlTest t(10); auto allocation = t.cc.allocate(5, 30); - std::vector acquired; + std::vector acquired; while (auto slot = allocation->tryAcquire()) acquired.emplace_back(std::move(slot)); ASSERT_TRUE(acquired.size() == 10); @@ -200,7 +200,7 @@ TEST(ConcurrencyControl, SetSlotCount) ASSERT_TRUE(acquired.size() == 5); // Check that newly added slots are equally distributed over waiting allocations - std::vector acquired2; + std::vector acquired2; auto allocation2 = t.cc.allocate(0, 30); ASSERT_TRUE(!allocation->tryAcquire()); t.cc.setMaxConcurrency(15); // 10 slots added: 5 to the first allocation and 5 to the second one @@ -224,7 +224,7 @@ TEST(ConcurrencyControl, MultipleThreads) auto run_query = [&] (size_t max_threads) { - ConcurrencyControl::AllocationPtr slots = t.cc.allocate(1, max_threads); + SlotAllocationPtr slots = t.cc.allocate(1, max_threads); std::mutex threads_mutex; std::vector threads; threads.reserve(max_threads); diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp index 580aaa2b259..a06bacd7d3b 100644 --- a/src/Processors/Executors/PipelineExecutor.cpp +++ b/src/Processors/Executors/PipelineExecutor.cpp @@ -138,8 +138,8 @@ bool PipelineExecutor::executeStep(std::atomic_bool * yield_flag) initializeExecution(1, true); // Acquire slot until we are done - single_thread_slot = slots->tryAcquire(); - chassert(single_thread_slot && "Unable to allocate slot for the first thread, but we just allocated at least one slot"); + single_thread_cpu_slot = cpu_slots->tryAcquire(); + chassert(single_thread_cpu_slot && "Unable to allocate cpu slot for the first thread, but we just allocated at least one slot"); if (yield_flag && *yield_flag) return true; @@ -155,7 +155,7 @@ bool PipelineExecutor::executeStep(std::atomic_bool * yield_flag) if (node->exception) std::rethrow_exception(node->exception); - single_thread_slot.reset(); + single_thread_cpu_slot.reset(); finalizeExecution(); return false; @@ -333,8 +333,8 @@ void PipelineExecutor::initializeExecution(size_t num_threads, bool concurrency_ /// Allocate CPU slots from concurrency control size_t min_threads = concurrency_control ? 
1uz : num_threads; - slots = ConcurrencyControl::instance().allocate(min_threads, num_threads); - use_threads = slots->grantedCount(); + cpu_slots = ConcurrencyControl::instance().allocate(min_threads, num_threads); + use_threads = cpu_slots->grantedCount(); Queue queue; graph->initializeExecution(queue); @@ -348,7 +348,7 @@ void PipelineExecutor::initializeExecution(size_t num_threads, bool concurrency_ void PipelineExecutor::spawnThreads() { - while (auto slot = slots->tryAcquire()) + while (auto slot = cpu_slots->tryAcquire()) { size_t thread_num = threads.fetch_add(1); @@ -405,7 +405,7 @@ void PipelineExecutor::executeImpl(size_t num_threads, bool concurrency_control) } else { - auto slot = slots->tryAcquire(); + auto slot = cpu_slots->tryAcquire(); executeSingleThread(0); } diff --git a/src/Processors/Executors/PipelineExecutor.h b/src/Processors/Executors/PipelineExecutor.h index 862a460f0ed..cb74b524163 100644 --- a/src/Processors/Executors/PipelineExecutor.h +++ b/src/Processors/Executors/PipelineExecutor.h @@ -68,8 +68,8 @@ private: ExecutorTasks tasks; /// Concurrency control related - ConcurrencyControl::AllocationPtr slots; - ConcurrencyControl::SlotPtr single_thread_slot; // slot for single-thread mode to work using executeStep() + SlotAllocationPtr cpu_slots; + AcquiredSlotPtr single_thread_cpu_slot; // cpu slot for single-thread mode to work using executeStep() std::unique_ptr pool; std::atomic_size_t threads = 0; From 151ade2318f38adc5b732423a1ee1d228e1e5966 Mon Sep 17 00:00:00 2001 From: Sergei Trifonov Date: Sun, 28 Jan 2024 21:38:21 +0100 Subject: [PATCH 155/884] Update src/Common/ISlotControl.h --- src/Common/ISlotControl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Common/ISlotControl.h b/src/Common/ISlotControl.h index add19f0cc0c..aa7414d5465 100644 --- a/src/Common/ISlotControl.h +++ b/src/Common/ISlotControl.h @@ -35,7 +35,7 @@ using SlotCount = UInt64; /// Unlimited number of slots constexpr SlotCount UnlimitedSlots = std::numeric_limits::max(); -/// Acquired slot holder. Slot is considered to be acquired as long the object exists. +/// Acquired slot holder. Slot is considered to be acquired as long as the object exists. class IAcquiredSlot : public std::enable_shared_from_this, boost::noncopyable { public: From 8798f469b3a0e38341e759f5cc98ca86b8220069 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Jan 2024 22:27:21 +0100 Subject: [PATCH 156/884] Fix conflicts. 
--- tests/ci/ci_config.py | 47 +++++++------------------------------------ 1 file changed, 7 insertions(+), 40 deletions(-) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 0fc4b3505ce..8c8c45b877c 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -50,6 +50,7 @@ class JobNames(metaclass=WithIter): STATELESS_TEST_DEBUG = "Stateless tests (debug)" STATELESS_TEST_RELEASE = "Stateless tests (release)" + STATELESS_TEST_RELEASE_COVERAGE = "Stateless tests (coverage)" STATELESS_TEST_AARCH64 = "Stateless tests (aarch64)" STATELESS_TEST_ASAN = "Stateless tests (asan)" STATELESS_TEST_TSAN = "Stateless tests (tsan)" @@ -64,6 +65,7 @@ class JobNames(metaclass=WithIter): STATEFUL_TEST_DEBUG = "Stateful tests (debug)" STATEFUL_TEST_RELEASE = "Stateful tests (release)" + STATEFUL_TEST_RELEASE_COVERAGE = "Stateful tests (coverage)" STATEFUL_TEST_AARCH64 = "Stateful tests (aarch64)" STATEFUL_TEST_ASAN = "Stateful tests (asan)" STATEFUL_TEST_TSAN = "Stateful tests (tsan)" @@ -763,18 +765,6 @@ CI_CONFIG = CIConfig( builds_report_config={ JobNames.BUILD_CHECK: BuildReportConfig( builds=[ -<<<<<<< HEAD - "package_release", - "package_aarch64", - "package_asan", - "package_ubsan", - "package_tsan", - "package_msan", - "package_debug", - "package_release_coverage", - "binary_release", - "fuzzers", -======= Build.PACKAGE_RELEASE, Build.PACKAGE_AARCH64, Build.PACKAGE_ASAN, @@ -782,9 +772,9 @@ CI_CONFIG = CIConfig( Build.PACKAGE_TSAN, Build.PACKAGE_MSAN, Build.PACKAGE_DEBUG, + Build.PACKAGE_RELEASE_COVERAGE, Build.BINARY_RELEASE, Build.FUZZERS, ->>>>>>> master ] ), JobNames.BUILD_CHECK_SPECIAL: BuildReportConfig( @@ -872,33 +862,15 @@ CI_CONFIG = CIConfig( JobNames.STATEFUL_TEST_RELEASE: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**stateful_test_common_params) # type: ignore ), -<<<<<<< HEAD - "Stateful tests (coverage)": TestConfig( - "package_release_coverage", job_config=JobConfig(**stateful_test_common_params) # type: ignore + JobNames.STATEFUL_TEST_RELEASE_COVERAGE: TestConfig( + Build.PACKAGE_RELEASE_COVERAGE, job_config=JobConfig(**stateful_test_common_params) # type: ignore ), - "Stateful tests (aarch64)": TestConfig( - "package_aarch64", job_config=JobConfig(**stateful_test_common_params) # type: ignore - ), - "Stateful tests (release, DatabaseOrdinary)": TestConfig( - "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore - ), - "Stateful tests (release, ParallelReplicas)": TestConfig( - "package_release", job_config=JobConfig(**stateful_test_common_params) # type: ignore -======= JobNames.STATEFUL_TEST_AARCH64: TestConfig( Build.PACKAGE_AARCH64, job_config=JobConfig(**stateful_test_common_params) # type: ignore ), - # FIXME: delete? 
- # "Stateful tests (release, DatabaseOrdinary)": TestConfig( - # Build.PACKAGE_RELEASE, job_config=JobConfig(**stateful_test_common_params) # type: ignore - # ), - # "Stateful tests (release, DatabaseReplicated)": TestConfig( - # Build.PACKAGE_RELEASE, job_config=JobConfig(**stateful_test_common_params) # type: ignore - # ), # Stateful tests for parallel replicas JobNames.STATEFUL_TEST_PARALLEL_REPL_RELEASE: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**stateful_test_common_params) # type: ignore ->>>>>>> master ), JobNames.STATEFUL_TEST_PARALLEL_REPL_DEBUG: TestConfig( Build.PACKAGE_DEBUG, job_config=JobConfig(**stateful_test_common_params) # type: ignore @@ -939,16 +911,11 @@ CI_CONFIG = CIConfig( JobNames.STATELESS_TEST_RELEASE: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**statless_test_common_params) # type: ignore ), -<<<<<<< HEAD - "Stateless tests (coverage)": TestConfig( - "package_release_coverage", job_config=JobConfig(**statless_test_common_params) # type: ignore + JobNames.STATELESS_TEST_RELEASE_COVERAGE: TestConfig( + Build.PACKAGE_RELEASE_COVERAGE, job_config=JobConfig(**statless_test_common_params) # type: ignore ), - "Stateless tests (aarch64)": TestConfig( - "package_aarch64", job_config=JobConfig(**statless_test_common_params) # type: ignore -======= JobNames.STATELESS_TEST_AARCH64: TestConfig( Build.PACKAGE_AARCH64, job_config=JobConfig(**statless_test_common_params) # type: ignore ->>>>>>> master ), JobNames.STATELESS_TEST_ANALYZER_RELEASE: TestConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**statless_test_common_params) # type: ignore From 1ef8062c7701c1788abda4d5da7ee56a5b9de372 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Jan 2024 22:28:09 +0100 Subject: [PATCH 157/884] Split by batches --- tests/ci/ci_config.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 8c8c45b877c..0cfddbe0435 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -912,7 +912,8 @@ CI_CONFIG = CIConfig( Build.PACKAGE_RELEASE, job_config=JobConfig(**statless_test_common_params) # type: ignore ), JobNames.STATELESS_TEST_RELEASE_COVERAGE: TestConfig( - Build.PACKAGE_RELEASE_COVERAGE, job_config=JobConfig(**statless_test_common_params) # type: ignore + Build.PACKAGE_RELEASE_COVERAGE, + job_config=JobConfig(num_batches=6, **statless_test_common_params) # type: ignore ), JobNames.STATELESS_TEST_AARCH64: TestConfig( Build.PACKAGE_AARCH64, job_config=JobConfig(**statless_test_common_params) # type: ignore From 982e3ddbddb0baaa2ac11f9dec74a8be8c8e8545 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Jan 2024 23:11:03 +0100 Subject: [PATCH 158/884] Fix Python --- tests/ci/ci_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/ci/ci_config.py b/tests/ci/ci_config.py index 0cfddbe0435..7458f25805f 100644 --- a/tests/ci/ci_config.py +++ b/tests/ci/ci_config.py @@ -913,7 +913,7 @@ CI_CONFIG = CIConfig( ), JobNames.STATELESS_TEST_RELEASE_COVERAGE: TestConfig( Build.PACKAGE_RELEASE_COVERAGE, - job_config=JobConfig(num_batches=6, **statless_test_common_params) # type: ignore + job_config=JobConfig(num_batches=6, **statless_test_common_params), # type: ignore ), JobNames.STATELESS_TEST_AARCH64: TestConfig( Build.PACKAGE_AARCH64, job_config=JobConfig(**statless_test_common_params) # type: ignore From 30f48e18938bbc5683d781f1cbfe7bfcf3fec8d9 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Sun, 28 Jan 2024 23:54:35 +0100 Subject: [PATCH 
159/884] Use MergeTree as a default table engine --- src/Core/Settings.h | 2 +- src/Core/SettingsChangesHistory.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index e0b3ca39899..4460a365846 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -558,7 +558,7 @@ class IColumn; M(UInt64, min_free_disk_space_for_temporary_data, 0, "The minimum disk space to keep while writing temporary data used in external sorting and aggregation.", 0) \ \ M(DefaultTableEngine, default_temporary_table_engine, DefaultTableEngine::Memory, "Default table engine used when ENGINE is not set in CREATE TEMPORARY statement.",0) \ - M(DefaultTableEngine, default_table_engine, DefaultTableEngine::None, "Default table engine used when ENGINE is not set in CREATE statement.",0) \ + M(DefaultTableEngine, default_table_engine, DefaultTableEngine::MergeTree, "Default table engine used when ENGINE is not set in CREATE statement.",0) \ M(Bool, show_table_uuid_in_table_create_query_if_not_nil, false, "For tables in databases with Engine=Atomic show UUID of the table in its CREATE query.", 0) \ M(Bool, database_atomic_wait_for_drop_and_detach_synchronously, false, "When executing DROP or DETACH TABLE in Atomic database, wait for table data to be finally dropped or detached.", 0) \ M(Bool, enable_scalar_subquery_optimization, true, "If it is set to true, prevent scalar subqueries from (de)serializing large scalar values and possibly avoid running the same subquery more than once.", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index dff0ebb759c..7bdab886934 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -100,6 +100,7 @@ static std::map sett {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, + {"default_table_engine", DefaultTableEngine::None, DefaultTableEngine::MergeTree, "Set default table engine to MergeTree for better usability"}, {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}}}, {"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}, {"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"}, From a22b68f46fec54f98fc3c3cb9a9c1f597bae7ffd Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 29 Jan 2024 10:49:36 +0100 Subject: [PATCH 160/884] Added setting azure_max_single_part_copy_size --- src/Core/Settings.h | 3 ++- .../AzureBlobStorage/AzureBlobStorageAuth.cpp | 4 +++- .../ObjectStorages/AzureBlobStorage/AzureObjectStorage.h | 3 +++ src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp | 6 +----- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 305d6466658..4ae5d1585f3 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -82,7 +82,8 @@ class IColumn; M(UInt64, s3_upload_part_size_multiply_parts_count_threshold, 500, "Each time this number of parts was uploaded to S3, s3_min_upload_part_size is multiplied by s3_upload_part_size_multiply_factor.", 0) \ M(UInt64, 
s3_max_inflight_parts_for_one_file, 20, "The maximum number of a concurrent loaded parts in multipart upload request. 0 means unlimited. You ", 0) \ M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ - M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ + M(UInt64, azure_max_single_part_upload_size, 100*1024*1024, "The maximum size of object to upload using singlepart upload to Azure blob storage.", 0) \ + M(UInt64, azure_max_single_part_copy_size, 256*1024*1024, "The maximum size of object to copy using single part copy to Azure blob storage.", 0) \ M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, azure_max_single_read_retries, 4, "The maximum number of retries during single Azure blob storage read.", 0) \ M(UInt64, s3_max_unexpected_write_error_retries, 4, "The maximum number of retries in case of unexpected errors during S3 write.", 0) \ diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp index 02b0d5bb599..9da84d430e4 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureBlobStorageAuth.cpp @@ -7,6 +7,7 @@ #include #include #include +#include using namespace Azure::Storage::Blobs; @@ -157,7 +158,7 @@ std::unique_ptr getAzureBlobContainerClient( } } -std::unique_ptr getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr /*context*/) +std::unique_ptr getAzureBlobStorageSettings(const Poco::Util::AbstractConfiguration & config, const String & config_prefix, ContextPtr context) { return std::make_unique( config.getUInt64(config_prefix + ".max_single_part_upload_size", 100 * 1024 * 1024), @@ -166,6 +167,7 @@ std::unique_ptr getAzureBlobStorageSettings(const Po config.getInt(config_prefix + ".max_single_download_retries", 3), config.getInt(config_prefix + ".list_object_keys_size", 1000), config.getUInt64(config_prefix + ".max_upload_part_size", 5ULL * 1024 * 1024 * 1024), + config.getUInt64(config_prefix + ".max_single_part_copy_size", context->getSettings().azure_max_single_part_copy_size), config.getBool(config_prefix + ".use_native_copy", false) ); } diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 0ae12fb205f..18b1a70defe 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -25,6 +25,7 @@ struct AzureObjectStorageSettings int max_single_download_retries_, int list_object_keys_size_, size_t max_upload_part_size_, + size_t max_single_part_copy_size_, bool use_native_copy_) : max_single_part_upload_size(max_single_part_upload_size_) , min_bytes_for_seek(min_bytes_for_seek_) @@ -32,6 +33,7 @@ struct AzureObjectStorageSettings , max_single_download_retries(max_single_download_retries_) , list_object_keys_size(list_object_keys_size_) , max_upload_part_size(max_upload_part_size_) + , max_single_part_copy_size(max_single_part_copy_size_) , use_native_copy(use_native_copy_) { } @@ -46,6 +48,7 @@ struct AzureObjectStorageSettings size_t min_upload_part_size = 16 * 1024 * 1024; size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; 
size_t max_part_number = 10000; + size_t max_single_part_copy_size = 256 * 1024 * 1024; bool use_native_copy = false; }; diff --git a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp index 537a5a191e7..ff4cfe62feb 100644 --- a/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp +++ b/src/IO/AzureBlobStorage/copyAzureBlobStorageFile.cpp @@ -33,10 +33,6 @@ namespace ErrorCodes extern const int AZURE_BLOB_STORAGE_ERROR; } - -size_t max_single_operation_copy_size = 256 * 1024 * 1024; - - namespace { class UploadHelper @@ -304,7 +300,7 @@ void copyAzureBlobStorageFile( auto block_blob_client_dest = dest_client->GetBlockBlobClient(dest_blob); auto source_uri = block_blob_client_src.GetUrl(); - if (size < max_single_operation_copy_size) + if (size < settings.get()->max_single_part_copy_size) { block_blob_client_dest.CopyFromUri(source_uri); } From 99a1b269d71054a1d4d1e59a55b229469652435c Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 29 Jan 2024 11:00:59 +0100 Subject: [PATCH 161/884] Removed unwanted setting --- src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 18b1a70defe..7d5c8f07a75 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -47,7 +47,6 @@ struct AzureObjectStorageSettings int list_object_keys_size = 1000; size_t min_upload_part_size = 16 * 1024 * 1024; size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; - size_t max_part_number = 10000; size_t max_single_part_copy_size = 256 * 1024 * 1024; bool use_native_copy = false; }; From ce0ebd964519d0961d92318e8a171d5338365213 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 29 Jan 2024 11:14:19 +0100 Subject: [PATCH 162/884] Removed unwanted log lines --- src/Backups/BackupImpl.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/Backups/BackupImpl.cpp b/src/Backups/BackupImpl.cpp index 0fb0d8cbda9..28a7d60b52c 100644 --- a/src/Backups/BackupImpl.cpp +++ b/src/Backups/BackupImpl.cpp @@ -535,7 +535,6 @@ void BackupImpl::checkBackupDoesntExist() const else file_name_to_check_existence = ".backup"; - LOG_INFO(&Poco::Logger::get("BackupImpl"), "checkBackupDoesntExist 1"); if (writer->fileExists(file_name_to_check_existence)) throw Exception(ErrorCodes::BACKUP_ALREADY_EXISTS, "Backup {} already exists", backup_name_for_logging); @@ -567,8 +566,6 @@ bool BackupImpl::checkLockFile(bool throw_if_failed) const if (throw_if_failed) { - LOG_INFO(&Poco::Logger::get("BackupImpl"), "checkLockFile"); - if (!writer->fileExists(lock_file_name)) { throw Exception( From 1ab29bef622a8de3af7bec194598e3939c9f2d7a Mon Sep 17 00:00:00 2001 From: yariks5s Date: Mon, 29 Jan 2024 15:33:09 +0000 Subject: [PATCH 163/884] fixes --- src/Functions/FunctionBinaryArithmetic.h | 54 ++++++++++++++----- src/Functions/IsOperation.h | 6 +-- .../00700_decimal_arithm.reference | 14 ++--- .../01717_int_div_float_too_large_ubsan.sql | 4 +- .../02975_intdiv_with_decimal.reference | 52 +++++++++++------- .../0_stateless/02975_intdiv_with_decimal.sql | 16 ++++++ 6 files changed, 101 insertions(+), 45 deletions(-) diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index e34514d15fd..831c1cf3aeb 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ 
b/src/Functions/FunctionBinaryArithmetic.h @@ -153,17 +153,18 @@ public: using ResultDataType = Switch< /// Result must be Integer Case< - only_integer && IsDataTypeDecimal && IsDataTypeDecimal, + only_integer && (IsDataTypeDecimal || IsDataTypeDecimal), Switch< - Case || std::is_same_v, DataTypeInt256>, - Case || std::is_same_v, DataTypeInt128>, - Case || std::is_same_v, DataTypeInt64>, - Case || std::is_same_v, DataTypeInt32>>>, - Case< - only_integer, - Switch< - Case, LeftDataType>, - Case, RightDataType>>>, + Case< + IsDataTypeDecimal || IsDataTypeDecimal, + Switch< + Case, LeftDataType>, + Case, RightDataType>, + Case || std::is_same_v, DataTypeInt256>, + Case || std::is_same_v, DataTypeInt128>, + Case || std::is_same_v, DataTypeInt64>, + Case || std::is_same_v, DataTypeInt32>>>>>, + /// Decimal cases Case || IsDataTypeDecimal), InvalidType>, Case< @@ -1713,12 +1714,37 @@ public: type_res = std::make_shared(result_type.getPrecision(), result_type.getScale()); } } - else if constexpr ((IsDataTypeDecimal && IsFloatingPoint) || - (IsDataTypeDecimal && IsFloatingPoint)) - type_res = std::make_shared(); + else if constexpr (((IsDataTypeDecimal && IsFloatingPoint) || + (IsDataTypeDecimal && IsFloatingPoint)) && !(is_div_int || is_div_int_or_zero)) + { + if constexpr ((is_div_int || is_div_int_or_zero) && IsDataTypeDecimal) + { + if constexpr (std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v || std::is_same_v) + type_res = std::make_shared(); + else + type_res = std::make_shared(); + } + else if constexpr (is_div_int || is_div_int_or_zero) + { + if constexpr (std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v) + type_res = std::make_shared(); + else if constexpr (std::is_same_v || std::is_same_v) + type_res = std::make_shared(); + else + type_res = std::make_shared(); + } + else + type_res = std::make_shared(); + } else if constexpr (IsDataTypeDecimal) { - if constexpr ((is_div_int || is_div_int_or_zero) && IsIntegral) + if constexpr ((is_div_int || is_div_int_or_zero) && IsIntegralOrExtended) type_res = std::make_shared(); else if constexpr (is_div_int || is_div_int_or_zero) { diff --git a/src/Functions/IsOperation.h b/src/Functions/IsOperation.h index b36530591ef..b2c7a27d375 100644 --- a/src/Functions/IsOperation.h +++ b/src/Functions/IsOperation.h @@ -61,10 +61,8 @@ struct IsOperation static constexpr bool bit_hamming_distance = IsSameOperation::value; static constexpr bool division = div_floating || div_int || div_int_or_zero || modulo; - - static constexpr bool division_allow_decimal = div_floating || modulo; - - static constexpr bool allow_decimal = plus || minus || multiply || division_allow_decimal || least || greatest; + // NOTE: allow_decimal should not fully contain `division` because of divInt + static constexpr bool allow_decimal = plus || minus || multiply || division || least || greatest; }; } diff --git a/tests/queries/0_stateless/00700_decimal_arithm.reference b/tests/queries/0_stateless/00700_decimal_arithm.reference index 811946c87e0..20f04696b1b 100644 --- a/tests/queries/0_stateless/00700_decimal_arithm.reference +++ b/tests/queries/0_stateless/00700_decimal_arithm.reference @@ -10,18 +10,18 @@ 63 21 -42 882 -882 2 0 2 0 63 21 -42 882 -882 2 0 2 0 1.00305798474369219219752355409390731264 -0.16305798474369219219752355409390731264 1.490591730234615865843651857942052864 -1.38847100762815390390123822295304634368 
1.38847100762815390390123822295304634368 0.02 0.005 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2.02 0.505 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2.02 0.505 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2.02 0.505 -63.42 21.42 -41.58 890.82 -890.82 2.02 0.5 2.02 0.5 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2 0 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2 0 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.505 2 0 +63.42 21.42 -41.58 890.82 -890.82 2.02 0.5 2 0 63 -21 42 882 -882 0 2 0 2 63 -21 42 882 -882 0 2 0 2 63 -21 42 882 -882 0 2 0 2 1.00305798474369219219752355409390731264 0.16305798474369219219752355409390731264 -1.490591730234615865843651857942052864 -1.38847100762815390390123822295304634368 1.38847100762815390390123822295304634368 -0.00000000000000000000000000000000000001 0.00000000000000000000000000000000000001 -63.42 -21.42 41.58 890.82 -890.82 0.495 1.98 0.495 1.98 +63.42 -21.42 41.58 890.82 -890.82 0.495 1.98 0 2 63.42 -21.42 41.58 890.82 -890.82 -63.42 -21.42 41.58 890.82 -890.82 0.495049504950495049 1.980198019801980198 0.495049504950495049 1.980198019801980198 -63.42 -21.42 41.58 890.82 -890.82 0.49 1.98 0.49 1.98 +63.42 -21.42 41.58 890.82 -890.82 0.495049504950495049 1.980198019801980198 0 2 +63.42 -21.42 41.58 890.82 -890.82 0.49 1.98 0 2 -42 42 42 42 0.42 0.42 0.42 42.42 42.42 42.42 0 0 0 0 0 0 0 0 0 0 42 -42 -42 -42 -0.42 -0.42 -0.42 -42.42 -42.42 -42.42 diff --git a/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql b/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql index c4f26a079f0..dc1e5b37050 100644 --- a/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql +++ b/tests/queries/0_stateless/01717_int_div_float_too_large_ubsan.sql @@ -1,2 +1,2 @@ -SELECT intDiv(9223372036854775807, 0.9998999834060669); -- { serverError 153 } -SELECT intDiv(9223372036854775807, 1.); -- { serverError 153 } +SELECT intDiv(18446744073709551615, 0.9998999834060669); -- { serverError 153 } +SELECT intDiv(18446744073709551615, 1.); -- { serverError 153 } diff --git a/tests/queries/0_stateless/02975_intdiv_with_decimal.reference b/tests/queries/0_stateless/02975_intdiv_with_decimal.reference index 9c1faab21d7..594dcee975a 100644 --- a/tests/queries/0_stateless/02975_intdiv_with_decimal.reference +++ b/tests/queries/0_stateless/02975_intdiv_with_decimal.reference @@ -24,28 +24,44 @@ 2 2 2 +1 +1 +1 +1 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 +2 2 2 2 2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 -2 +1 +1 +1 +1 2 2 2 diff --git a/tests/queries/0_stateless/02975_intdiv_with_decimal.sql b/tests/queries/0_stateless/02975_intdiv_with_decimal.sql index 8fc4b5a9a7d..18e657caa8a 100644 --- a/tests/queries/0_stateless/02975_intdiv_with_decimal.sql +++ b/tests/queries/0_stateless/02975_intdiv_with_decimal.sql @@ -25,6 +25,14 @@ SELECT intDiv(toDecimal256(4.4, 2), toDecimal128(2.2, 2)); SELECT intDiv(toDecimal64(4.4, 2), toDecimal256(2.2, 2)); SELECT intDiv(toDecimal128(4.4, 2), toDecimal256(2.2, 2)); SELECT intDiv(toDecimal256(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDiv(4.2, toDecimal32(2.2, 2)); +SELECT intDiv(4.2, toDecimal64(2.2, 2)); +SELECT intDiv(4.2, toDecimal128(2.2, 2)); +SELECT intDiv(4.2, toDecimal256(2.2, 2)); +SELECT intDiv(toDecimal32(4.4, 2), 2.2); +SELECT intDiv(toDecimal64(4.4, 2), 2.2); +SELECT intDiv(toDecimal128(4.4, 2), 2.2); +SELECT intDiv(toDecimal256(4.4, 2), 2.2); --intDivOrZero-- SELECT intDivOrZero(4,2); SELECT intDivOrZero(toDecimal32(4.4, 2), 2); @@ -52,3 +60,11 @@ SELECT 
intDivOrZero(toDecimal256(4.4, 2), toDecimal128(2.2, 2)); SELECT intDivOrZero(toDecimal64(4.4, 2), toDecimal256(2.2, 2)); SELECT intDivOrZero(toDecimal128(4.4, 2), toDecimal256(2.2, 2)); SELECT intDivOrZero(toDecimal256(4.4, 2), toDecimal256(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal32(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal64(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal128(2.2, 2)); +SELECT intDivOrZero(4.2, toDecimal256(2.2, 2)); +SELECT intDivOrZero(toDecimal32(4.4, 2), 2.2); +SELECT intDivOrZero(toDecimal64(4.4, 2), 2.2); +SELECT intDivOrZero(toDecimal128(4.4, 2), 2.2); +SELECT intDivOrZero(toDecimal256(4.4, 2), 2.2); From 6bfa910d9ea403e91fb9be04573c73bfae77b4c4 Mon Sep 17 00:00:00 2001 From: Smita Kulkarni Date: Mon, 29 Jan 2024 16:47:02 +0100 Subject: [PATCH 164/884] Fix merge --- src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h index 78a67f3e59a..8556f0237e3 100644 --- a/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h +++ b/src/Disks/ObjectStorages/AzureBlobStorage/AzureObjectStorage.h @@ -47,12 +47,10 @@ struct AzureObjectStorageSettings size_t max_single_read_retries = 3; size_t max_single_download_retries = 3; int list_object_keys_size = 1000; -<<<<<<< HEAD size_t min_upload_part_size = 16 * 1024 * 1024; size_t max_upload_part_size = 5ULL * 1024 * 1024 * 1024; size_t max_single_part_copy_size = 256 * 1024 * 1024; bool use_native_copy = false; -======= size_t max_unexpected_write_error_retries = 4; >>>>>>> master }; From 4a8a7208f2a21236de1fa5140a2980a9bdf98974 Mon Sep 17 00:00:00 2001 From: Blargian Date: Mon, 29 Jan 2024 21:25:58 +0100 Subject: [PATCH 165/884] rename of settings, add setting for resultset, extend test, fix documentation and add to SettingsChanges log --- docs/en/interfaces/formats.md | 6 +-- .../operations/settings/settings-formats.md | 6 ++- docs/ru/interfaces/formats.md | 6 +-- src/Core/Settings.h | 3 +- src/Core/SettingsChangesHistory.h | 4 +- src/Formats/FormatFactory.cpp | 3 +- src/Formats/FormatSettings.h | 3 +- .../Impl/TemplateBlockOutputFormat.cpp | 39 ++++++++++++------- ...0937_format_schema_rows_template.reference | 5 +++ .../00937_format_schema_rows_template.sh | 24 ++++++++++-- 10 files changed, 71 insertions(+), 28 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a3f54c1c383..0f597282f9e 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -253,7 +253,7 @@ This format is also available under the name `TSVRawWithNamesAndNames`. This format allows specifying a custom format string with placeholders for values with a specified escaping rule. -It uses settings `format_template_resultset`, `format_template_row` (`format_schema_rows_template`), `format_template_rows_between_delimiter` and some settings of other formats (e.g. `output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) +It uses settings `format_template_resultset`, `format_template_row` (`format_template_row_format`), `format_template_rows_between_delimiter` and some settings of other formats (e.g. 
`output_format_json_quote_64bit_integers` when using `JSON` escaping, see further) Setting `format_template_row` specifies the path to the file containing format strings for rows with the following syntax: @@ -279,11 +279,11 @@ the values of `SearchPhrase`, `c` and `price` columns, which are escaped as `Quo `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` -In cases where it is challenging or not possible to deploy format output configuration for the template format to a directory on all nodes in a cluster, or if the format is trivial then `format_schema_rows_template` can be used to pass the template string directly in the query, rather than a path to the file which contains it. +In cases where it is challenging or not possible to deploy format output configuration for the template format to a directory on all nodes in a cluster, or if the format is trivial then `format_template_row_format` can be used to set the template string directly in the query, rather than a path to the file which contains it. The `format_template_rows_between_delimiter` setting specifies the delimiter between rows, which is printed (or expected) after every row except the last one (`\n` by default) -Setting `format_template_resultset` specifies the path to the file, which contains a format string for resultset. Format string for resultset has the same syntax as a format string for row and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: +Setting `format_template_resultset` specifies the path to the file, which contains a format string for resultset. Setting `format_template_resultset_format` can be used to set the template string for the result set directly in the query itself. Format string for resultset has the same syntax as a format string for row and allows to specify a prefix, a suffix and a way to print some additional information. It contains the following placeholders instead of column names: - `data` is the rows with data in `format_template_row` format, separated by `format_template_rows_between_delimiter`. This placeholder must be the first placeholder in the format string. - `totals` is the row with total values in `format_template_row` format (when using WITH TOTALS) diff --git a/docs/en/operations/settings/settings-formats.md b/docs/en/operations/settings/settings-formats.md index 5dedaa2f6ab..816812b1e3a 100644 --- a/docs/en/operations/settings/settings-formats.md +++ b/docs/en/operations/settings/settings-formats.md @@ -1660,6 +1660,10 @@ Result: Path to file which contains format string for result set (for Template format). +### format_template_resultset_format {#format_template_resultset_format} + +Format string for result set (for Template format) + ### format_template_row {#format_template_row} Path to file which contains format string for rows (for Template format). @@ -1668,7 +1672,7 @@ Path to file which contains format string for rows (for Template format). Delimiter between rows (for Template format). 
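For illustration only (not part of this patch): a minimal query sketch, assuming a table `template(question String, answer String, likes UInt64, date Date)` like the one created in the test further below, showing the row template and row delimiter passed inline through the new settings instead of schema files:

```sql
SELECT question, likes
FROM template
ORDER BY date
FORMAT Template
SETTINGS
    format_template_row_format = 'Question: ${question:Quoted}, Likes: ${likes:Raw}',
    format_template_rows_between_delimiter = ';\n';
```

The same query could additionally set `format_template_resultset_format` to wrap the rows in a custom prefix and suffix, so no template files need to be deployed to the cluster nodes.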
-### format_schema_rows_template {#format_schema_rows_template} +### format_template_row_format {#format_template_row_format} Format string for rows (for Template format) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 8f8197e2221..a9280de9c7b 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -201,7 +201,7 @@ SELECT * FROM nestedt FORMAT TSV Этот формат позволяет указать произвольную форматную строку, в которую подставляются значения, сериализованные выбранным способом. -Для этого используются настройки `format_template_resultset`, `format_template_row` (`format_schema_rows_template`), `format_template_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) +Для этого используются настройки `format_template_resultset`, `format_template_row` (`format_template_row_format`), `format_template_rows_between_delimiter` и настройки экранирования других форматов (например, `output_format_json_quote_64bit_integers` при экранировании как в `JSON`, см. далее) Настройка `format_template_row` задаёт путь к файлу, содержащему форматную строку для строк таблицы, которая должна иметь вид: @@ -227,11 +227,11 @@ SELECT * FROM nestedt FORMAT TSV `Search phrase: 'bathroom interior design', count: 2166, ad price: $3;` -В тех случаях, когда не удобно или не возможно указать произвольную форматную строку в файле, можно использовать `format_schema_rows_template` указать произвольную форматную строку в запросе. +В тех случаях, когда не удобно или не возможно указать произвольную форматную строку в файле, можно использовать `format_template_row_format` указать произвольную форматную строку в запросе. Настройка `format_template_rows_between_delimiter` задаёт разделитель между строками, который выводится (или ожмдается при вводе) после каждой строки, кроме последней. По умолчанию `\n`. -Настройка `format_template_resultset` задаёт путь к файлу, содержащему форматную строку для результата. Форматная строка для результата имеет синтаксис аналогичный форматной строке для строк таблицы и позволяет указать префикс, суффикс и способ вывода дополнительной информации. Вместо имён столбцов в ней указываются следующие имена подстановок: +Настройка `format_template_resultset` задаёт путь к файлу, содержащему форматную строку для результата. Настройка `format_template_resultset_format` используется для установки форматной строки для результата непосредственно в запросе. Форматная строка для результата имеет синтаксис аналогичный форматной строке для строк таблицы и позволяет указать префикс, суффикс и способ вывода дополнительной информации. Вместо имён столбцов в ней указываются следующие имена подстановок: - `data` - строки с данными в формате `format_template_row`, разделённые `format_template_rows_between_delimiter`. Эта подстановка должна быть первой подстановкой в форматной строке. 
- `totals` - строка с тотальными значениями в формате `format_template_row` (при использовании WITH TOTALS) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index f9e3f401d98..bb946f0d861 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -1080,7 +1080,8 @@ class IColumn; M(String, format_schema, "", "Schema identifier (used by schema-based formats)", 0) \ M(String, format_template_resultset, "", "Path to file which contains format string for result set (for Template format)", 0) \ M(String, format_template_row, "", "Path to file which contains format string for rows (for Template format)", 0) \ - M(String, format_schema_rows_template, "", "Format string for rows (for Template format)", 0) \ + M(String, format_template_row_format, "", "Format string for rows (for Template format)", 0) \ + M(String, format_template_resultset_format, "", "Format string for result set (for Template format)", 0) \ M(String, format_template_rows_between_delimiter, "\n", "Delimiter between rows (for Template format)", 0) \ \ M(EscapingRule, format_custom_escaping_rule, "Escaped", "Field escaping rule (for CustomSeparated format)", 0) \ diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 859ba99b5f7..8faf43c7e01 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -99,7 +99,9 @@ static std::map sett {"output_format_pretty_color", true, "auto", "Setting is changed to allow also for auto value, disabling ANSI escapes if output is not a tty"}, {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, - {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}}}, + {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, + {"format_template_row_format", "none", "", "Template row format string can be set directly in query"}, + {"format_template_resultset_format", "none", "", "Template result set format string can be set in query"}}}, {"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}, {"input_format_parquet_allow_missing_columns", false, true, "Allow missing columns in Parquet files by default"}, {"input_format_orc_allow_missing_columns", false, true, "Allow missing columns in ORC files by default"}, diff --git a/src/Formats/FormatFactory.cpp b/src/Formats/FormatFactory.cpp index 274994b4168..8c39b4b71e4 100644 --- a/src/Formats/FormatFactory.cpp +++ b/src/Formats/FormatFactory.cpp @@ -166,7 +166,8 @@ FormatSettings getFormatSettings(ContextPtr context, const Settings & settings) format_settings.template_settings.resultset_format = settings.format_template_resultset; format_settings.template_settings.row_between_delimiter = settings.format_template_rows_between_delimiter; format_settings.template_settings.row_format = settings.format_template_row; - format_settings.template_settings.row_format_schema = settings.format_schema_rows_template; + format_settings.template_settings.row_format_template = settings.format_template_row_format; + format_settings.template_settings.resultset_format_template = settings.format_template_resultset_format; format_settings.tsv.crlf_end_of_line = settings.output_format_tsv_crlf_end_of_line; 
format_settings.tsv.empty_as_default = settings.input_format_tsv_empty_as_default; format_settings.tsv.enum_as_number = settings.input_format_tsv_enum_as_number; diff --git a/src/Formats/FormatSettings.h b/src/Formats/FormatSettings.h index 28a2076af84..bdd2dda5287 100644 --- a/src/Formats/FormatSettings.h +++ b/src/Formats/FormatSettings.h @@ -338,7 +338,8 @@ struct FormatSettings String resultset_format; String row_format; String row_between_delimiter; - String row_format_schema; + String row_format_template; + String resultset_format_template; } template_settings; struct diff --git a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp index efda754917b..1c43a0fa331 100644 --- a/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp +++ b/src/Processors/Formats/Impl/TemplateBlockOutputFormat.cpp @@ -194,13 +194,25 @@ void registerOutputFormatTemplate(FormatFactory & factory) const FormatSettings & settings) { ParsedTemplateFormatString resultset_format; + auto idx_resultset_by_name = [&](const String & partName) + { + return static_cast(TemplateBlockOutputFormat::stringToResultsetPart(partName)); + }; if (settings.template_settings.resultset_format.empty()) { /// Default format string: "${data}" - resultset_format.delimiters.resize(2); - resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); - resultset_format.format_idx_to_column_idx.emplace_back(0); - resultset_format.column_names.emplace_back("data"); + if (settings.template_settings.resultset_format_template.empty()) + { + resultset_format.delimiters.resize(2); + resultset_format.escaping_rules.emplace_back(ParsedTemplateFormatString::EscapingRule::None); + resultset_format.format_idx_to_column_idx.emplace_back(0); + resultset_format.column_names.emplace_back("data"); + } + else + { + resultset_format = ParsedTemplateFormatString(); + resultset_format.parse(settings.template_settings.resultset_format_template, idx_resultset_by_name); + } } else { @@ -208,31 +220,32 @@ void registerOutputFormatTemplate(FormatFactory & factory) resultset_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.resultset_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), - [&](const String & partName) - { - return static_cast(TemplateBlockOutputFormat::stringToResultsetPart(partName)); - }); + idx_resultset_by_name); + if (!settings.template_settings.resultset_format_template.empty()) + { + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_resultset or format_template_resultset_format, but not both"); + } } ParsedTemplateFormatString row_format; - auto idx_by_name = [&](const String & colName) + auto idx_row_by_name = [&](const String & colName) { return sample.getPositionByName(colName); }; if (settings.template_settings.row_format.empty()) { row_format = ParsedTemplateFormatString(); - row_format.parse(settings.template_settings.row_format_schema,idx_by_name); + row_format.parse(settings.template_settings.row_format_template, idx_row_by_name); } else { row_format = ParsedTemplateFormatString( FormatSchemaInfo(settings.template_settings.row_format, "Template", false, settings.schema.is_server, settings.schema.format_schema_path), - idx_by_name); - if (!settings.template_settings.row_format_schema.empty()) + idx_row_by_name); + if (!settings.template_settings.row_format_template.empty()) { - throw 
Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_schema_rows_template, but not both"); + throw Exception(DB::ErrorCodes::INVALID_TEMPLATE_FORMAT, "Expected either format_template_row or format_template_row_format, but not both"); } } return std::make_shared(sample, buf, settings, resultset_format, row_format, settings.template_settings.row_between_delimiter); diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.reference b/tests/queries/0_stateless/00937_format_schema_rows_template.reference index 5f59cca2629..85bab456512 100644 --- a/tests/queries/0_stateless/00937_format_schema_rows_template.reference +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.reference @@ -2,3 +2,8 @@ Question: 'How awesome is clickhouse?', Answer: 'unbelievably awesome!', Number Question: 'How fast is clickhouse?', Answer: 'Lightning fast!', Number of Likes: 9876543210, Date: 2016-01-03; Question: 'Is it opensource?', Answer: 'of course it is!', Number of Likes: 789, Date: 2016-01-04 +===== Results ===== +Question: 'How awesome is clickhouse?', Answer: 'unbelievably awesome!', Number of Likes: 456, Date: 2016-01-02; +Question: 'How fast is clickhouse?', Answer: 'Lightning fast!', Number of Likes: 9876543210, Date: 2016-01-03; +Question: 'Is it opensource?', Answer: 'of course it is!', Number of Likes: 789, Date: 2016-01-04 +=================== diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.sh b/tests/queries/0_stateless/00937_format_schema_rows_template.sh index aff5de3b555..d773fedfd3d 100755 --- a/tests/queries/0_stateless/00937_format_schema_rows_template.sh +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.sh @@ -5,7 +5,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh -# Test format_schema_rows_template setting +# Test format_template_row_format setting $CLICKHOUSE_CLIENT --query="DROP TABLE IF EXISTS template"; $CLICKHOUSE_CLIENT --query="CREATE TABLE template (question String, answer String, likes UInt64, date Date) ENGINE = Memory"; @@ -15,17 +15,33 @@ $CLICKHOUSE_CLIENT --query="INSERT INTO template VALUES ('Is it opensource?', 'of course it is!', 789, '2016-01-04')"; $CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ -format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ format_template_rows_between_delimiter = ';\n'"; echo -e "\n" -# Test that if both format_schema_rows_template setting and format_template_row are provided, error is thrown +# Test that if both format_template_row_format setting and format_template_row are provided, error is thrown echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Likes: ${likes:Raw}, Date: ${date:Raw}' > "$CURDIR"/00937_template_output_format_row.tmp $CLICKHOUSE_CLIENT --multiline --multiquery --query "SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ format_template_row = '$CURDIR/00937_template_output_format_row.tmp', \ -format_schema_rows_template = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_rows_between_delimiter = ';\n'; --{clientError 474}" + +# Test format_template_resultset_format setting + +$CLICKHOUSE_CLIENT --query="SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ +format_template_resultset_format = '===== Results ===== \n\${data}\n===================\n', \ +format_template_rows_between_delimiter = ';\n'"; + +# Test that if both format_template_result_format setting and format_template_resultset are provided, error is thrown +echo -ne '===== Resultset ===== \n \${data} \n ===============' > "$CURDIR"/00937_template_output_format_resultset.tmp +$CLICKHOUSE_CLIENT --multiline --multiquery --query "SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ +format_template_resultset = '$CURDIR/00937_template_output_format_resultset.tmp', \ +format_template_resultset_format = '===== Resultset ===== \n \${data} \n ===============', \ +format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ format_template_rows_between_delimiter = ';\n'; --{clientError 474}" $CLICKHOUSE_CLIENT --query="DROP TABLE template"; rm "$CURDIR"/00937_template_output_format_row.tmp +rm "$CURDIR"/00937_template_output_format_resultset.tmp From 8183074500b4d0d0755b48d01ff85215f16c3dfd Mon Sep 17 00:00:00 2001 From: Shaun Struwig <41984034+Blargian@users.noreply.github.com> Date: Mon, 29 Jan 
2024 21:32:42 +0100 Subject: [PATCH 166/884] Update src/Core/SettingsChangesHistory.h set previous value as empty string for added setting rather than "none" in SettingsChangesHistory.h Co-authored-by: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> --- src/Core/SettingsChangesHistory.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 960b64e5b6a..4a6a5d15be5 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -102,8 +102,8 @@ static std::map sett {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, - {"format_template_row_format", "none", "", "Template row format string can be set directly in query"}, - {"format_template_resultset_format", "none", "", "Template result set format string can be set in query"}, + {"format_template_row_format", "", "", "Template row format string can be set directly in query"}, + {"format_template_resultset_format", "", "", "Template result set format string can be set in query"}, {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}, {"update_insert_deduplication_token_in_dependent_materialized_views", false, false, "Allow to update insert deduplication token with table identifier during insert in dependent materialized views"}}}, {"23.12", {{"allow_suspicious_ttl_expressions", true, false, "It is a new setting, and in previous versions the behavior was equivalent to allowing."}, From 3f1ec9a9881949c7e676cf11f35fb75df3b95f78 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Tue, 30 Jan 2024 04:23:16 +0100 Subject: [PATCH 167/884] Fix error --- src/Core/SettingsChangesHistory.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Core/SettingsChangesHistory.h b/src/Core/SettingsChangesHistory.h index 692d8fc6360..53b14ddc385 100644 --- a/src/Core/SettingsChangesHistory.h +++ b/src/Core/SettingsChangesHistory.h @@ -102,7 +102,7 @@ static std::map sett {"function_visible_width_behavior", 0, 1, "We changed the default behavior of `visibleWidth` to be more precise"}, {"max_estimated_execution_time", 0, 0, "Separate max_execution_time and max_estimated_execution_time"}, {"iceberg_engine_ignore_schema_evolution", false, false, "Allow to ignore schema evolution in Iceberg table engine"}, - {"default_table_engine", DefaultTableEngine::None, DefaultTableEngine::MergeTree, "Set default table engine to MergeTree for better usability"}, + {"default_table_engine", "None", "MergeTree", "Set default table engine to MergeTree for better usability"}, {"optimize_injective_functions_in_group_by", false, true, "Replace injective functions by it's arguments in GROUP BY section in analyzer"}, {"update_insert_deduplication_token_in_dependent_materialized_views", false, false, "Allow to update insert deduplication token with table identifier during insert in dependent materialized views"}, {"azure_max_unexpected_write_error_retries", 4, 4, "The maximum number of retries in case of unexpected errors during Azure blob storage write"}}}, From 0ded5800112f95f7b13ca8d060e743559ce787e6 Mon Sep 17 00:00:00 2001 From: flynn Date: Tue, 30 Jan 2024 04:03:27 +0000 
Subject: [PATCH 168/884] Fix --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 14 +++++++++++++- .../replaceForPositionalArguments.cpp | 19 +++++++++++++++---- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index c683214840b..fbabef87112 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -2170,7 +2170,19 @@ void QueryAnalyzer::replaceNodesWithPositionalArguments(QueryTreeNodePtr & node_ else // Int64 { auto value = constant_node->getValue().get(); - pos = value > 0 ? value : projection_nodes.size() + value + 1; + if (value > 0) + pos = value; + else + { + if (static_cast(std::abs(value)) > projection_nodes.size()) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Negtive positional argument number {} is out of bounds. Expected in range [-{}, -1]. In scope {}", + value, + projection_nodes.size(), + scope.scope_node->formatASTForErrorMessage()); + pos = projection_nodes.size() + value + 1; + } } if (!pos || pos > projection_nodes.size()) diff --git a/src/Interpreters/replaceForPositionalArguments.cpp b/src/Interpreters/replaceForPositionalArguments.cpp index bea87ad913a..c72cac25c9d 100644 --- a/src/Interpreters/replaceForPositionalArguments.cpp +++ b/src/Interpreters/replaceForPositionalArguments.cpp @@ -10,7 +10,8 @@ namespace DB namespace ErrorCodes { - extern const int ILLEGAL_TYPE_OF_ARGUMENT; +extern const int BAD_ARGUMENTS; +extern const int ILLEGAL_TYPE_OF_ARGUMENT; } bool replaceForPositionalArguments(ASTPtr & argument, const ASTSelectQuery * select_query, ASTSelectQuery::Expression expression) @@ -39,7 +40,18 @@ bool replaceForPositionalArguments(ASTPtr & argument, const ASTSelectQuery * sel else if (which == Field::Types::Int64) { auto value = ast_literal->value.get(); - pos = value > 0 ? value : columns.size() + value + 1; + if (value > 0) + pos = value; + else + { + if (static_cast(std::abs(value)) > columns.size()) + throw Exception( + ErrorCodes::BAD_ARGUMENTS, + "Negtive positional argument number {} is out of bounds. 
Expected in range [-{}, -1]", + value, + columns.size()); + pos = columns.size() + value + 1; + } } else { @@ -47,8 +59,7 @@ bool replaceForPositionalArguments(ASTPtr & argument, const ASTSelectQuery * sel } if (!pos || pos > columns.size()) - throw Exception( - ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Positional argument out of bounds: {} (expected in range [1, {}]", pos, columns.size()); + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Positional argument out of bounds: {} (expected in range [1, {}]", pos, columns.size()); const auto & column = columns[--pos]; if (typeid_cast(column.get()) || typeid_cast(column.get())) From 4f12ca249d4d728c403f52c1d68edda68a9af286 Mon Sep 17 00:00:00 2001 From: flynn Date: Tue, 30 Jan 2024 07:01:07 +0000 Subject: [PATCH 169/884] Fix typo --- src/Analyzer/Passes/QueryAnalysisPass.cpp | 2 +- src/Interpreters/replaceForPositionalArguments.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Analyzer/Passes/QueryAnalysisPass.cpp b/src/Analyzer/Passes/QueryAnalysisPass.cpp index 54767c88993..d9434c878d2 100644 --- a/src/Analyzer/Passes/QueryAnalysisPass.cpp +++ b/src/Analyzer/Passes/QueryAnalysisPass.cpp @@ -2180,7 +2180,7 @@ void QueryAnalyzer::replaceNodesWithPositionalArguments(QueryTreeNodePtr & node_ if (static_cast(std::abs(value)) > projection_nodes.size()) throw Exception( ErrorCodes::BAD_ARGUMENTS, - "Negtive positional argument number {} is out of bounds. Expected in range [-{}, -1]. In scope {}", + "Negative positional argument number {} is out of bounds. Expected in range [-{}, -1]. In scope {}", value, projection_nodes.size(), scope.scope_node->formatASTForErrorMessage()); diff --git a/src/Interpreters/replaceForPositionalArguments.cpp b/src/Interpreters/replaceForPositionalArguments.cpp index c72cac25c9d..cceb0650fcd 100644 --- a/src/Interpreters/replaceForPositionalArguments.cpp +++ b/src/Interpreters/replaceForPositionalArguments.cpp @@ -47,7 +47,7 @@ bool replaceForPositionalArguments(ASTPtr & argument, const ASTSelectQuery * sel if (static_cast(std::abs(value)) > columns.size()) throw Exception( ErrorCodes::BAD_ARGUMENTS, - "Negtive positional argument number {} is out of bounds. Expected in range [-{}, -1]", + "Negative positional argument number {} is out of bounds. 
Expected in range [-{}, -1]", value, columns.size()); pos = columns.size() + value + 1; From bab6e6fe3402f8408acba177a6d4318c9b90ea1b Mon Sep 17 00:00:00 2001 From: Kruglov Pavel <48961922+Avogar@users.noreply.github.com> Date: Tue, 30 Jan 2024 11:37:32 +0100 Subject: [PATCH 170/884] Fix tests --- tests/integration/test_storage_azure_blob_storage/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_storage_azure_blob_storage/test.py b/tests/integration/test_storage_azure_blob_storage/test.py index 75ef50ec12a..e1d636f3831 100644 --- a/tests/integration/test_storage_azure_blob_storage/test.py +++ b/tests/integration/test_storage_azure_blob_storage/test.py @@ -967,7 +967,7 @@ def test_union_schema_inference_mode(cluster): f"desc azureBlobStorage('{storage_account_url}', 'cont', 'test_union_schema_inference*.jsonl', '{account_name}', '{account_key}', 'auto', 'auto', 'auto') settings schema_inference_mode='union', describe_compact_output=1 format TSV", expect_error="true", ) - assert "Cannot extract table structure" in error + assert "CANNOT_EXTRACT_TABLE_STRUCTURE" in error def test_schema_inference_cache(cluster): From 416910db00b2a7fe1cc32aeb7396b494ce2cc2f3 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 30 Jan 2024 13:03:43 +0000 Subject: [PATCH 171/884] Remove unnecessary outer loop --- src/Common/PoolWithFailoverBase.h | 69 ++++++++++++++----------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/src/Common/PoolWithFailoverBase.h b/src/Common/PoolWithFailoverBase.h index 6da4445950c..ef4bb40535f 100644 --- a/src/Common/PoolWithFailoverBase.h +++ b/src/Common/PoolWithFailoverBase.h @@ -249,51 +249,44 @@ PoolWithFailoverBase::getMany( }); std::string fail_messages; - bool finished = false; - while (!finished) + for (size_t i = 0; i < shuffled_pools.size(); ++i) { - for (size_t i = 0; i < shuffled_pools.size(); ++i) + if (up_to_date_count >= max_entries /// Already enough good entries. + || entries_count + failed_pools_count >= nested_pools.size()) /// No more good entries will be produced. + break; + + ShuffledPool & shuffled_pool = shuffled_pools[i]; + TryResult & result = try_results[i]; + if (max_tries && (shuffled_pool.error_count >= max_tries || !result.entry.isNull())) + continue; + + std::string fail_message; + result = try_get_entry(*shuffled_pool.pool, fail_message); + + if (!fail_message.empty()) + fail_messages += fail_message + '\n'; + + if (!result.entry.isNull()) { - if (up_to_date_count >= max_entries /// Already enough good entries. - || entries_count + failed_pools_count >= nested_pools.size()) /// No more good entries will be produced. 
+ ++entries_count; + if (result.is_usable) { - finished = true; - break; + ++usable_count; + if (result.is_up_to_date) + ++up_to_date_count; } + } + else + { + LOG_WARNING(log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message); + ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); - ShuffledPool & shuffled_pool = shuffled_pools[i]; - TryResult & result = try_results[i]; - if (max_tries && (shuffled_pool.error_count >= max_tries || !result.entry.isNull())) - continue; + shuffled_pool.error_count = std::min(max_error_cap, shuffled_pool.error_count + 1); - std::string fail_message; - result = try_get_entry(*shuffled_pool.pool, fail_message); - - if (!fail_message.empty()) - fail_messages += fail_message + '\n'; - - if (!result.entry.isNull()) + if (shuffled_pool.error_count >= max_tries) { - ++entries_count; - if (result.is_usable) - { - ++usable_count; - if (result.is_up_to_date) - ++up_to_date_count; - } - } - else - { - LOG_WARNING(log, "Connection failed at try №{}, reason: {}", (shuffled_pool.error_count + 1), fail_message); - ProfileEvents::increment(ProfileEvents::DistributedConnectionFailTry); - - shuffled_pool.error_count = std::min(max_error_cap, shuffled_pool.error_count + 1); - - if (shuffled_pool.error_count >= max_tries) - { - ++failed_pools_count; - ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll); - } + ++failed_pools_count; + ProfileEvents::increment(ProfileEvents::DistributedConnectionFailAtAll); } } } From 6624e34580caaac255c39d5edcd1b136007839c9 Mon Sep 17 00:00:00 2001 From: Igor Nikonov Date: Tue, 30 Jan 2024 13:06:47 +0000 Subject: [PATCH 172/884] RemoteQueryExecutor constructors formatting --- src/QueryPipeline/RemoteQueryExecutor.cpp | 36 ++++++++++++++++------- src/QueryPipeline/RemoteQueryExecutor.h | 33 +++++++++++++++------ 2 files changed, 50 insertions(+), 19 deletions(-) diff --git a/src/QueryPipeline/RemoteQueryExecutor.cpp b/src/QueryPipeline/RemoteQueryExecutor.cpp index 136a3bb09c6..1caedfc8511 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.cpp +++ b/src/QueryPipeline/RemoteQueryExecutor.cpp @@ -64,9 +64,14 @@ RemoteQueryExecutor::RemoteQueryExecutor( RemoteQueryExecutor::RemoteQueryExecutor( Connection & connection, - const String & query_, const Block & header_, ContextPtr context_, - ThrottlerPtr throttler, const Scalars & scalars_, const Tables & external_tables_, - QueryProcessingStage::Enum stage_, std::optional extension_) + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler, + const Scalars & scalars_, + const Tables & external_tables_, + QueryProcessingStage::Enum stage_, + std::optional extension_) : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_) { create_connections = [this, &connection, throttler, extension_](AsyncCallback) @@ -80,9 +85,14 @@ RemoteQueryExecutor::RemoteQueryExecutor( RemoteQueryExecutor::RemoteQueryExecutor( std::shared_ptr connection_ptr, - const String & query_, const Block & header_, ContextPtr context_, - ThrottlerPtr throttler, const Scalars & scalars_, const Tables & external_tables_, - QueryProcessingStage::Enum stage_, std::optional extension_) + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler, + const Scalars & scalars_, + const Tables & external_tables_, + QueryProcessingStage::Enum stage_, + std::optional extension_) : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, 
stage_, extension_) { create_connections = [this, connection_ptr, throttler, extension_](AsyncCallback) @@ -96,12 +106,18 @@ RemoteQueryExecutor::RemoteQueryExecutor( RemoteQueryExecutor::RemoteQueryExecutor( std::vector && connections_, - const String & query_, const Block & header_, ContextPtr context_, - const ThrottlerPtr & throttler, const Scalars & scalars_, const Tables & external_tables_, - QueryProcessingStage::Enum stage_, std::optional extension_) + const String & query_, + const Block & header_, + ContextPtr context_, + const ThrottlerPtr & throttler, + const Scalars & scalars_, + const Tables & external_tables_, + QueryProcessingStage::Enum stage_, + std::optional extension_) : RemoteQueryExecutor(query_, header_, context_, scalars_, external_tables_, stage_, extension_) { - create_connections = [this, connections_, throttler, extension_](AsyncCallback) mutable { + create_connections = [this, connections_, throttler, extension_](AsyncCallback) mutable + { auto res = std::make_unique(std::move(connections_), context->getSettingsRef(), throttler); if (extension_ && extension_->replica_info) res->setReplicaInfo(*extension_->replica_info); diff --git a/src/QueryPipeline/RemoteQueryExecutor.h b/src/QueryPipeline/RemoteQueryExecutor.h index 444f1258f3e..e874b4be726 100644 --- a/src/QueryPipeline/RemoteQueryExecutor.h +++ b/src/QueryPipeline/RemoteQueryExecutor.h @@ -56,23 +56,38 @@ public: /// Takes already set connection. RemoteQueryExecutor( Connection & connection, - const String & query_, const Block & header_, ContextPtr context_, - ThrottlerPtr throttler_ = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), - QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::optional extension_ = std::nullopt); + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler_ = nullptr, + const Scalars & scalars_ = Scalars(), + const Tables & external_tables_ = Tables(), + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, + std::optional extension_ = std::nullopt); /// Takes already set connection. RemoteQueryExecutor( std::shared_ptr connection, - const String & query_, const Block & header_, ContextPtr context_, - ThrottlerPtr throttler_ = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), - QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::optional extension_ = std::nullopt); + const String & query_, + const Block & header_, + ContextPtr context_, + ThrottlerPtr throttler_ = nullptr, + const Scalars & scalars_ = Scalars(), + const Tables & external_tables_ = Tables(), + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, + std::optional extension_ = std::nullopt); /// Accepts several connections already taken from pool. 
RemoteQueryExecutor( std::vector && connections_, - const String & query_, const Block & header_, ContextPtr context_, - const ThrottlerPtr & throttler = nullptr, const Scalars & scalars_ = Scalars(), const Tables & external_tables_ = Tables(), - QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, std::optional extension_ = std::nullopt); + const String & query_, + const Block & header_, + ContextPtr context_, + const ThrottlerPtr & throttler = nullptr, + const Scalars & scalars_ = Scalars(), + const Tables & external_tables_ = Tables(), + QueryProcessingStage::Enum stage_ = QueryProcessingStage::Complete, + std::optional extension_ = std::nullopt); /// Takes a pool and gets one or several connections from it. RemoteQueryExecutor( From c891ed03c1ae5d85acc1b5f124433d634d23b278 Mon Sep 17 00:00:00 2001 From: Blargian Date: Tue, 30 Jan 2024 14:19:10 +0100 Subject: [PATCH 173/884] update test to use CLICKHOUSE_TEST_UNIQUE_NAME so parallel tests don't fail --- .../00937_format_schema_rows_template.sh | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/tests/queries/0_stateless/00937_format_schema_rows_template.sh b/tests/queries/0_stateless/00937_format_schema_rows_template.sh index d773fedfd3d..0221527f9c9 100755 --- a/tests/queries/0_stateless/00937_format_schema_rows_template.sh +++ b/tests/queries/0_stateless/00937_format_schema_rows_template.sh @@ -20,10 +20,11 @@ format_template_rows_between_delimiter = ';\n'"; echo -e "\n" -# Test that if both format_template_row_format setting and format_template_row are provided, error is thrown -echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Likes: ${likes:Raw}, Date: ${date:Raw}' > "$CURDIR"/00937_template_output_format_row.tmp +# Test that if both format_template_row_format setting and format_template_row are provided, error is thrown +row_format_file="$CURDIR"/"${CLICKHOUSE_TEST_UNIQUE_NAME}"_template_output_format_row.tmp +echo -ne 'Question: ${question:Quoted}, Answer: ${answer:Quoted}, Number of Likes: ${likes:Raw}, Date: ${date:Raw}' > $row_format_file $CLICKHOUSE_CLIENT --multiline --multiquery --query "SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ -format_template_row = '$CURDIR/00937_template_output_format_row.tmp', \ +format_template_row = '$row_format_file', \ format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ format_template_rows_between_delimiter = ';\n'; --{clientError 474}" @@ -35,13 +36,14 @@ format_template_resultset_format = '===== Results ===== \n\${data}\n============ format_template_rows_between_delimiter = ';\n'"; # Test that if both format_template_result_format setting and format_template_resultset are provided, error is thrown -echo -ne '===== Resultset ===== \n \${data} \n ===============' > "$CURDIR"/00937_template_output_format_resultset.tmp +resultset_output_file="$CURDIR"/"$CLICKHOUSE_TEST_UNIQUE_NAME"_template_output_format_resultset.tmp +echo -ne '===== Resultset ===== \n \${data} \n ===============' > $resultset_output_file $CLICKHOUSE_CLIENT --multiline --multiquery --query "SELECT * FROM template GROUP BY question, answer, likes, date WITH TOTALS ORDER BY date LIMIT 3 FORMAT Template SETTINGS \ -format_template_resultset = '$CURDIR/00937_template_output_format_resultset.tmp', \ +format_template_resultset = '$resultset_output_file', \ format_template_resultset_format = '===== 
Resultset ===== \n \${data} \n ===============', \ format_template_row_format = 'Question: \${question:Quoted}, Answer: \${answer:Quoted}, Number of Likes: \${likes:Raw}, Date: \${date:Raw}', \ format_template_rows_between_delimiter = ';\n'; --{clientError 474}" $CLICKHOUSE_CLIENT --query="DROP TABLE template"; -rm "$CURDIR"/00937_template_output_format_row.tmp -rm "$CURDIR"/00937_template_output_format_resultset.tmp +rm $row_format_file +rm $resultset_output_file From 7a1458c9227f47de485a06e6e473d059da381631 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 30 Jan 2024 15:21:58 +0000 Subject: [PATCH 174/884] Fix validating suspicious/experimental types in nested types --- .../parseColumnsListForTableFunction.cpp | 29 +++++++++++++++++-- .../02981_nested_bad_types.reference | 0 .../0_stateless/02981_nested_bad_types.sql | 27 +++++++++++++++++ 3 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 tests/queries/0_stateless/02981_nested_bad_types.reference create mode 100644 tests/queries/0_stateless/02981_nested_bad_types.sql diff --git a/src/Interpreters/parseColumnsListForTableFunction.cpp b/src/Interpreters/parseColumnsListForTableFunction.cpp index 551a883d093..fcdad7c93c1 100644 --- a/src/Interpreters/parseColumnsListForTableFunction.cpp +++ b/src/Interpreters/parseColumnsListForTableFunction.cpp @@ -7,6 +7,9 @@ #include #include #include +#include +#include +#include namespace DB @@ -48,8 +51,7 @@ void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings if (!settings.allow_suspicious_fixed_string_types) { - auto basic_type = removeLowCardinalityAndNullable(type); - if (const auto * fixed_string = typeid_cast(basic_type.get())) + if (const auto * fixed_string = typeid_cast(type.get())) { if (fixed_string->getN() > MAX_FIXEDSTRING_SIZE_WITHOUT_SUSPICIOUS) throw Exception( @@ -71,6 +73,29 @@ void validateDataType(const DataTypePtr & type, const DataTypeValidationSettings "Set setting allow_experimental_variant_type = 1 in order to allow it", type->getName()); } } + + if (const auto * nullable_type = typeid_cast(type.get())) + { + validateDataType(nullable_type->getNestedType(), settings); + } + else if (const auto * lc_type = typeid_cast(type.get())) + { + validateDataType(lc_type->getDictionaryType(), settings); + } + else if (const auto * array_type = typeid_cast(type.get())) + { + validateDataType(array_type->getNestedType(), settings); + } + else if (const auto * tuple_type = typeid_cast(type.get())) + { + for (const auto & element : tuple_type->getElements()) + validateDataType(element, settings); + } + else if (const auto * map_type = typeid_cast(type.get())) + { + validateDataType(map_type->getKeyType(), settings); + validateDataType(map_type->getValueType(), settings); + } } ColumnsDescription parseColumnsListFromString(const std::string & structure, const ContextPtr & context) diff --git a/tests/queries/0_stateless/02981_nested_bad_types.reference b/tests/queries/0_stateless/02981_nested_bad_types.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02981_nested_bad_types.sql b/tests/queries/0_stateless/02981_nested_bad_types.sql new file mode 100644 index 00000000000..663d39cb1e2 --- /dev/null +++ b/tests/queries/0_stateless/02981_nested_bad_types.sql @@ -0,0 +1,27 @@ +set allow_suspicious_low_cardinality_types=0; +set allow_suspicious_fixed_string_types=0; +set allow_experimental_variant_type=0; + +select [42]::Array(LowCardinality(UInt64)); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} 
+select [[[42]]]::Array(Array(Array(LowCardinality(UInt64)))); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select map('a', 42)::Map(String, LowCardinality(UInt64)); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select map('a', map('b', [42]))::Map(String, Map(String, Array(LowCardinality(UInt64)))); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select tuple('a', 42)::Tuple(String, LowCardinality(UInt64)); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} +select tuple('a', [map('b', 42)])::Tuple(String, Array(Map(String, LowCardinality(UInt64)))); -- {serverError SUSPICIOUS_TYPE_FOR_LOW_CARDINALITY} + +select [42]::Array(FixedString(1000000)); -- {serverError ILLEGAL_COLUMN} +select [42]::Array(FixedString(1000000)); -- {serverError ILLEGAL_COLUMN} +select [[[42]]]::Array(Array(Array(FixedString(1000000)))); -- {serverError ILLEGAL_COLUMN} +select map('a', 42)::Map(String, FixedString(1000000)); -- {serverError ILLEGAL_COLUMN} +select map('a', map('b', [42]))::Map(String, Map(String, Array(FixedString(1000000)))); -- {serverError ILLEGAL_COLUMN} +select tuple('a', 42)::Tuple(String, FixedString(1000000)); -- {serverError ILLEGAL_COLUMN} +select tuple('a', [map('b', 42)])::Tuple(String, Array(Map(String, FixedString(1000000)))); -- {serverError ILLEGAL_COLUMN} + +select [42]::Array(Variant(String, UInt64)); -- {serverError ILLEGAL_COLUMN} +select [42]::Array(Variant(String, UInt64)); -- {serverError ILLEGAL_COLUMN} +select [[[42]]]::Array(Array(Array(Variant(String, UInt64)))); -- {serverError ILLEGAL_COLUMN} +select map('a', 42)::Map(String, Variant(String, UInt64)); -- {serverError ILLEGAL_COLUMN} +select map('a', map('b', [42]))::Map(String, Map(String, Array(Variant(String, UInt64)))); -- {serverError ILLEGAL_COLUMN} +select tuple('a', 42)::Tuple(String, Variant(String, UInt64)); -- {serverError ILLEGAL_COLUMN} +select tuple('a', [map('b', 42)])::Tuple(String, Array(Map(String, Variant(String, UInt64)))); -- {serverError ILLEGAL_COLUMN} + From 0557cdb8a9def2e4c8df81d23cb526153ce023f8 Mon Sep 17 00:00:00 2001 From: yariks5s Date: Tue, 30 Jan 2024 15:31:04 +0000 Subject: [PATCH 175/884] fix due to review --- src/Functions/FunctionBinaryArithmetic.h | 40 ++++++++++-------------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index 831c1cf3aeb..62a50f5e0c2 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -156,14 +156,18 @@ public: only_integer && (IsDataTypeDecimal || IsDataTypeDecimal), Switch< Case< - IsDataTypeDecimal || IsDataTypeDecimal, + IsDataTypeDecimal, + Switch< + Case, DataTypeInt256>, + Case, DataTypeInt128>, + Case, DataTypeInt64>, + Case, DataTypeInt32>>>, + Case< + IsDataTypeDecimal, Switch< Case, LeftDataType>, - Case, RightDataType>, - Case || std::is_same_v, DataTypeInt256>, - Case || std::is_same_v, DataTypeInt128>, - Case || std::is_same_v, DataTypeInt64>, - Case || std::is_same_v, DataTypeInt32>>>>>, + Case, DataTypeInt64>, + Case, DataTypeInt32>>>>>, /// Decimal cases Case || IsDataTypeDecimal), InvalidType>, @@ -1684,11 +1688,11 @@ public: { if constexpr (is_div_int || is_div_int_or_zero) { - if constexpr (std::is_same_v || std::is_same_v) + if constexpr (std::is_same_v) type_res = std::make_shared(); - else if constexpr (std::is_same_v || std::is_same_v) + else if constexpr (std::is_same_v) type_res = std::make_shared(); - else if constexpr (std::is_same_v || std::is_same_v) + else if 
constexpr (std::is_same_v) type_res = std::make_shared(); else type_res = std::make_shared(); @@ -1723,18 +1727,14 @@ public: type_res = std::make_shared(); else if constexpr (std::is_same_v) type_res = std::make_shared(); - else if constexpr (std::is_same_v || std::is_same_v) + else if constexpr (std::is_same_v) type_res = std::make_shared(); else type_res = std::make_shared(); } else if constexpr (is_div_int || is_div_int_or_zero) { - if constexpr (std::is_same_v) - type_res = std::make_shared(); - else if constexpr (std::is_same_v) - type_res = std::make_shared(); - else if constexpr (std::is_same_v || std::is_same_v) + if constexpr (std::is_same_v) type_res = std::make_shared(); else type_res = std::make_shared(); @@ -1744,9 +1744,7 @@ public: } else if constexpr (IsDataTypeDecimal) { - if constexpr ((is_div_int || is_div_int_or_zero) && IsIntegralOrExtended) - type_res = std::make_shared(); - else if constexpr (is_div_int || is_div_int_or_zero) + if constexpr (is_div_int || is_div_int_or_zero) { if constexpr (std::is_same_v) type_res = std::make_shared(); @@ -1766,11 +1764,7 @@ public: type_res = std::make_shared(); else if constexpr (is_div_int || is_div_int_or_zero) { - if constexpr (std::is_same_v) - type_res = std::make_shared(); - else if constexpr (std::is_same_v) - type_res = std::make_shared(); - else if constexpr (std::is_same_v) + if constexpr (std::is_same_v) type_res = std::make_shared(); else type_res = std::make_shared(); From 998c56fc3d3602a1151c7e310863e12666e595e9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 30 Jan 2024 17:36:34 +0100 Subject: [PATCH 176/884] Move code --- src/Compression/CompressionCodecT64.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Compression/CompressionCodecT64.cpp b/src/Compression/CompressionCodecT64.cpp index 42c6a18aa77..3ddc56fe4f6 100644 --- a/src/Compression/CompressionCodecT64.cpp +++ b/src/Compression/CompressionCodecT64.cpp @@ -168,6 +168,7 @@ TypeIndex baseType(TypeIndex type_idx) return TypeIndex::Int16; case TypeIndex::Int32: case TypeIndex::Decimal32: + case TypeIndex::Date32: return TypeIndex::Int32; case TypeIndex::Int64: case TypeIndex::Decimal64: @@ -180,8 +181,6 @@ TypeIndex baseType(TypeIndex type_idx) case TypeIndex::Enum16: case TypeIndex::Date: return TypeIndex::UInt16; - case TypeIndex::Date32: - return TypeIndex::Int32; case TypeIndex::UInt32: case TypeIndex::DateTime: case TypeIndex::IPv4: From e2a66f8e6594fcb8c95f47a6f2670869c78a4a35 Mon Sep 17 00:00:00 2001 From: avogar Date: Tue, 30 Jan 2024 16:39:52 +0000 Subject: [PATCH 177/884] Fix tests --- tests/queries/0_stateless/02010_array_index_bad_cast.sql | 1 + .../0_stateless/02797_join_nested_lowcardinality_convert.sql.j2 | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/queries/0_stateless/02010_array_index_bad_cast.sql b/tests/queries/0_stateless/02010_array_index_bad_cast.sql index 19c58bb28a7..42a6556fc77 100644 --- a/tests/queries/0_stateless/02010_array_index_bad_cast.sql +++ b/tests/queries/0_stateless/02010_array_index_bad_cast.sql @@ -1,2 +1,3 @@ -- This query throws exception about uncomparable data types (but at least it does not introduce bad cast in code). 
+SET allow_suspicious_low_cardinality_types=1; SELECT has(materialize(CAST(['2021-07-14'] AS Array(LowCardinality(Nullable(DateTime))))), materialize('2021-07-14'::DateTime64(7))); -- { serverError 44 } diff --git a/tests/queries/0_stateless/02797_join_nested_lowcardinality_convert.sql.j2 b/tests/queries/0_stateless/02797_join_nested_lowcardinality_convert.sql.j2 index 79a7c654f10..95bac76c591 100644 --- a/tests/queries/0_stateless/02797_join_nested_lowcardinality_convert.sql.j2 +++ b/tests/queries/0_stateless/02797_join_nested_lowcardinality_convert.sql.j2 @@ -1,4 +1,4 @@ - +SET allow_suspicious_low_cardinality_types=1; DROP TABLE IF EXISTS test1__fuzz_36; DROP TABLE IF EXISTS test1__fuzz_38; From 0576aa2b7fd060c68f482f8205575bd904356ebe Mon Sep 17 00:00:00 2001 From: yariks5s Date: Tue, 30 Jan 2024 16:45:36 +0000 Subject: [PATCH 178/884] fix fuzzer --- src/Functions/FunctionBinaryArithmetic.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/FunctionBinaryArithmetic.h b/src/Functions/FunctionBinaryArithmetic.h index 62a50f5e0c2..e31183573c3 100644 --- a/src/Functions/FunctionBinaryArithmetic.h +++ b/src/Functions/FunctionBinaryArithmetic.h @@ -1719,7 +1719,7 @@ public: } } else if constexpr (((IsDataTypeDecimal && IsFloatingPoint) || - (IsDataTypeDecimal && IsFloatingPoint)) && !(is_div_int || is_div_int_or_zero)) + (IsDataTypeDecimal && IsFloatingPoint))) { if constexpr ((is_div_int || is_div_int_or_zero) && IsDataTypeDecimal) { @@ -1760,7 +1760,7 @@ public: } else if constexpr (IsDataTypeDecimal) { - if constexpr ((is_div_int || is_div_int_or_zero) && IsIntegral) + if constexpr ((is_div_int || is_div_int_or_zero) && IsIntegralOrExtended) type_res = std::make_shared(); else if constexpr (is_div_int || is_div_int_or_zero) { From 023b8cbd53c1d3788e97d17b0329e3330c1cc0eb Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Tue, 30 Jan 2024 17:47:11 +0100 Subject: [PATCH 179/884] Retry disconnects and expired sessions --- .../System/StorageSystemZooKeeper.cpp | 39 +++++++++++++++++-- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/src/Storages/System/StorageSystemZooKeeper.cpp b/src/Storages/System/StorageSystemZooKeeper.cpp index 37fe9074950..9a671f08138 100644 --- a/src/Storages/System/StorageSystemZooKeeper.cpp +++ b/src/Storages/System/StorageSystemZooKeeper.cpp @@ -424,9 +424,35 @@ void ReadFromSystemZooKeeper::applyFilters() paths = extractPath(getFilterNodes().nodes, context, context->getSettingsRef().allow_unrestricted_reads_from_keeper); } +/// Executes a request to Keeper and retries it in case of expired sessions and disconnects +template +static Result runWithReconnects(Operation && operation, ContextPtr context, QueryStatusPtr query_status) +{ + constexpr int max_retries = 20; /// Limit retries by some reasonable number to avoid infinite loops + for (int attempt = 0; ; ++attempt) + { + if (query_status) + query_status->checkTimeLimit(); + + zkutil::ZooKeeperPtr keeper = context->getZooKeeper(); + + try + { + return operation(keeper); + } + catch (const Coordination::Exception & e) + { + if (!Coordination::isHardwareError(e.code) || + attempt >= max_retries || + e.code == Coordination::Error::ZOPERATIONTIMEOUT) + throw; + } + } +} + void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) { - zkutil::ZooKeeperPtr zookeeper = context->getZooKeeper(); + QueryStatusPtr query_status = context->getProcessListElement(); if (paths.empty()) throw Exception(ErrorCodes::BAD_ARGUMENTS, @@ -448,6 +474,9 @@ void 
ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) std::unordered_set added; while (!paths.empty()) { + if (query_status) + query_status->checkTimeLimit(); + list_tasks.clear(); std::vector paths_to_list; while (!paths.empty() && static_cast(list_tasks.size()) < max_inflight_requests) @@ -470,7 +499,9 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) paths_to_list.emplace_back(task.path_corrected); list_tasks.emplace_back(std::move(task)); } - auto list_responses = zookeeper->tryGetChildren(paths_to_list); + auto list_responses = runWithReconnects( + [&paths_to_list](zkutil::ZooKeeperPtr zookeeper) { return zookeeper->tryGetChildren(paths_to_list); }, + context, query_status); struct GetTask { @@ -514,7 +545,9 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) } } - auto get_responses = zookeeper->tryGet(paths_to_get); + auto get_responses = runWithReconnects( + [&paths_to_get](zkutil::ZooKeeperPtr zookeeper) { return zookeeper->tryGet(paths_to_get); }, + context, query_status); for (size_t i = 0, size = get_tasks.size(); i < size; ++i) { From 82c06ca2949601989699afac6a6bddd05ef2d4f6 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Tue, 30 Jan 2024 18:31:37 +0100 Subject: [PATCH 180/884] Use ZooKeeperRetriesControl --- .../System/StorageSystemZooKeeper.cpp | 40 ++++--------------- 1 file changed, 8 insertions(+), 32 deletions(-) diff --git a/src/Storages/System/StorageSystemZooKeeper.cpp b/src/Storages/System/StorageSystemZooKeeper.cpp index 9a671f08138..61919f53b24 100644 --- a/src/Storages/System/StorageSystemZooKeeper.cpp +++ b/src/Storages/System/StorageSystemZooKeeper.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -424,32 +425,6 @@ void ReadFromSystemZooKeeper::applyFilters() paths = extractPath(getFilterNodes().nodes, context, context->getSettingsRef().allow_unrestricted_reads_from_keeper); } -/// Executes a request to Keeper and retries it in case of expired sessions and disconnects -template -static Result runWithReconnects(Operation && operation, ContextPtr context, QueryStatusPtr query_status) -{ - constexpr int max_retries = 20; /// Limit retries by some reasonable number to avoid infinite loops - for (int attempt = 0; ; ++attempt) - { - if (query_status) - query_status->checkTimeLimit(); - - zkutil::ZooKeeperPtr keeper = context->getZooKeeper(); - - try - { - return operation(keeper); - } - catch (const Coordination::Exception & e) - { - if (!Coordination::isHardwareError(e.code) || - attempt >= max_retries || - e.code == Coordination::Error::ZOPERATIONTIMEOUT) - throw; - } - } -} - void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) { QueryStatusPtr query_status = context->getProcessListElement(); @@ -499,9 +474,10 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) paths_to_list.emplace_back(task.path_corrected); list_tasks.emplace_back(std::move(task)); } - auto list_responses = runWithReconnects( - [&paths_to_list](zkutil::ZooKeeperPtr zookeeper) { return zookeeper->tryGetChildren(paths_to_list); }, - context, query_status); + + zkutil::ZooKeeper::MultiTryGetChildrenResponse list_responses; + ZooKeeperRetriesControl("", nullptr, ZooKeeperRetriesInfo(20, 1, 1000), query_status).retryLoop( + [&]() { list_responses = context->getZooKeeper()->tryGetChildren(paths_to_list); }); struct GetTask { @@ -545,9 +521,9 @@ void ReadFromSystemZooKeeper::fillData(MutableColumns & res_columns) } } - auto get_responses = runWithReconnects( - 
[&paths_to_get](zkutil::ZooKeeperPtr zookeeper) { return zookeeper->tryGet(paths_to_get); }, - context, query_status); + zkutil::ZooKeeper::MultiTryGetResponse get_responses; + ZooKeeperRetriesControl("", nullptr, ZooKeeperRetriesInfo(20, 1, 1000), query_status).retryLoop( + [&]() { get_responses = context->getZooKeeper()->tryGet(paths_to_get); }); for (size_t i = 0, size = get_tasks.size(); i < size; ++i) { From c348c4e828e6d2c4978d03936e82e825a5966a59 Mon Sep 17 00:00:00 2001 From: Alexander Gololobov Date: Tue, 30 Jan 2024 19:02:17 +0100 Subject: [PATCH 181/884] Move ZooKeeperRetries.h to Common --- src/Backups/BackupEntriesCollector.h | 2 +- src/Backups/WithRetries.h | 2 +- src/{Storages/MergeTree => Common/ZooKeeper}/ZooKeeperRetries.h | 0 src/Interpreters/executeDDLQueryOnCluster.h | 2 +- src/Storages/MergeTree/ReplicatedMergeTreeSink.h | 2 +- src/Storages/System/StorageSystemZooKeeper.cpp | 2 +- 6 files changed, 5 insertions(+), 5 deletions(-) rename src/{Storages/MergeTree => Common/ZooKeeper}/ZooKeeperRetries.h (100%) diff --git a/src/Backups/BackupEntriesCollector.h b/src/Backups/BackupEntriesCollector.h index bad67e494c4..01e8d594334 100644 --- a/src/Backups/BackupEntriesCollector.h +++ b/src/Backups/BackupEntriesCollector.h @@ -6,7 +6,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Backups/WithRetries.h b/src/Backups/WithRetries.h index 3a6e28996b9..f795a963911 100644 --- a/src/Backups/WithRetries.h +++ b/src/Backups/WithRetries.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include diff --git a/src/Storages/MergeTree/ZooKeeperRetries.h b/src/Common/ZooKeeper/ZooKeeperRetries.h similarity index 100% rename from src/Storages/MergeTree/ZooKeeperRetries.h rename to src/Common/ZooKeeper/ZooKeeperRetries.h diff --git a/src/Interpreters/executeDDLQueryOnCluster.h b/src/Interpreters/executeDDLQueryOnCluster.h index 7daf9babf9f..d3365553875 100644 --- a/src/Interpreters/executeDDLQueryOnCluster.h +++ b/src/Interpreters/executeDDLQueryOnCluster.h @@ -5,7 +5,7 @@ #include #include #include -#include +#include namespace zkutil diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h index bc23204e7d3..29f3183be64 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h +++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/Storages/System/StorageSystemZooKeeper.cpp b/src/Storages/System/StorageSystemZooKeeper.cpp index 61919f53b24..6aa85e6a9e9 100644 --- a/src/Storages/System/StorageSystemZooKeeper.cpp +++ b/src/Storages/System/StorageSystemZooKeeper.cpp @@ -1,7 +1,6 @@ #include #include #include -#include #include #include #include @@ -13,6 +12,7 @@ #include #include #include +#include #include #include #include From 299c390d2b17e118a0fc87a21bc8859d135e006b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 30 Jan 2024 15:56:41 +0100 Subject: [PATCH 182/884] Add some fuzzing to ASTLiterals --- src/Client/QueryFuzzer.cpp | 57 ++++++++++++++++++++++++++++++++++---- src/Client/QueryFuzzer.h | 2 ++ 2 files changed, 54 insertions(+), 5 deletions(-) diff --git a/src/Client/QueryFuzzer.cpp b/src/Client/QueryFuzzer.cpp index 629d36e7960..786d5af0cb3 100644 --- a/src/Client/QueryFuzzer.cpp +++ b/src/Client/QueryFuzzer.cpp @@ -903,6 +903,54 @@ void QueryFuzzer::notifyQueryFailed(ASTPtr ast) remove_fuzzed_table(insert->getTable()); } +ASTPtr 
QueryFuzzer::fuzzLiteralUnderExpressionList(ASTPtr child) +{ + auto * l = child->as(); + chassert(l); + auto type = l->value.getType(); + if (type == Field::Types::Which::String && fuzz_rand() % 7 == 0) + { + String value = l->value.get(); + child = makeASTFunction( + "toFixedString", std::make_shared(value), std::make_shared(static_cast(value.size()))); + } + + if (fuzz_rand() % 11 == 0) + { + String value = l->value.get(); + child = makeASTFunction("toNullable", child); + } + + if (fuzz_rand() % 11 == 0) + { + String value = l->value.get(); + child = makeASTFunction("toLowCardinality", child); + } + + if (fuzz_rand() % 11 == 0) + { + String value = l->value.get(); + child = makeASTFunction("materialize", child); + } + + return child; +} + + +void QueryFuzzer::fuzzExpressionList(ASTExpressionList & expr_list) +{ + for (size_t i = 0; i < expr_list.children.size(); i++) + { + if (auto * literal = typeid_cast(expr_list.children[i].get())) + { + if (fuzz_rand() % 13 == 0) + expr_list.children[i] = fuzzLiteralUnderExpressionList(expr_list.children[i]); + } + else + fuzz(expr_list.children[i]); + } +} + void QueryFuzzer::fuzz(ASTs & asts) { for (auto & ast : asts) @@ -989,7 +1037,7 @@ void QueryFuzzer::fuzz(ASTPtr & ast) } else if (auto * expr_list = typeid_cast(ast.get())) { - fuzz(expr_list->children); + fuzzExpressionList(*expr_list); } else if (auto * order_by_element = typeid_cast(ast.get())) { @@ -1108,7 +1156,7 @@ void QueryFuzzer::fuzz(ASTPtr & ast) } /* * The time to fuzz the settings has not yet come. - * Apparently we don't have any infractructure to validate the values of + * Apparently we don't have any infrastructure to validate the values of * the settings, and the first query with max_block_size = -1 breaks * because of overflows here and there. *//* @@ -1131,9 +1179,8 @@ void QueryFuzzer::fuzz(ASTPtr & ast) // are ASTPtr -- this is redundant ownership, but hides the error if the // child field is replaced. Others can be ASTLiteral * or the like, which // leads to segfault if the pointed-to AST is replaced. - // Replacing children is safe in case of ASTExpressionList. In a more - // general case, we can change the value of ASTLiteral, which is what we - // do here. + // Replacing children is safe in case of ASTExpressionList (done in fuzzExpressionList). 
In a more + // general case, we can change the value of ASTLiteral, which is what we do here if (fuzz_rand() % 11 == 0) { literal->value = fuzzField(literal->value);
diff --git a/src/Client/QueryFuzzer.h b/src/Client/QueryFuzzer.h
index 18c7b8a9241..cdeba2b76fd 100644
--- a/src/Client/QueryFuzzer.h
+++ b/src/Client/QueryFuzzer.h
@@ -95,6 +95,8 @@ struct QueryFuzzer void fuzzExplainSettings(ASTSetQuery & settings_ast, ASTExplainQuery::ExplainKind kind); void fuzzColumnDeclaration(ASTColumnDeclaration & column); void fuzzTableName(ASTTableExpression & table); + ASTPtr fuzzLiteralUnderExpressionList(ASTPtr child); + void fuzzExpressionList(ASTExpressionList & expr_list); void fuzz(ASTs & asts); void fuzz(ASTPtr & ast); void collectFuzzInfoMain(ASTPtr ast);
From 4b5e992565b060cc002495f8c58cceb79c75d53a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?=
Date: Tue, 30 Jan 2024 19:33:31 +0100
Subject: [PATCH 183/884] Fix problems
--- src/Client/QueryFuzzer.cpp | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-)
diff --git a/src/Client/QueryFuzzer.cpp b/src/Client/QueryFuzzer.cpp
index 786d5af0cb3..bb551fcb11e 100644
--- a/src/Client/QueryFuzzer.cpp
+++ b/src/Client/QueryFuzzer.cpp
@@ -916,22 +916,13 @@ ASTPtr QueryFuzzer::fuzzLiteralUnderExpressionList(ASTPtr child) } if (fuzz_rand() % 11 == 0) - { - String value = l->value.get(); child = makeASTFunction("toNullable", child); - } if (fuzz_rand() % 11 == 0) - { - String value = l->value.get(); child = makeASTFunction("toLowCardinality", child); - } if (fuzz_rand() % 11 == 0) - { - String value = l->value.get(); child = makeASTFunction("materialize", child); - } return child; }
@@ -939,15 +930,15 @@ ASTPtr QueryFuzzer::fuzzLiteralUnderExpressionList(ASTPtr child) void QueryFuzzer::fuzzExpressionList(ASTExpressionList & expr_list) { - for (size_t i = 0; i < expr_list.children.size(); i++) + for (auto & child : expr_list.children) { - if (auto * literal = typeid_cast(expr_list.children[i].get())) + if (auto * literal = typeid_cast(child.get())) { if (fuzz_rand() % 13 == 0) - expr_list.children[i] = fuzzLiteralUnderExpressionList(expr_list.children[i]); + child = fuzzLiteralUnderExpressionList(child); } else - fuzz(expr_list.children[i]); + fuzz(child); } }
From a3f0546f48af77d7c120a7e71d94b992a4446e2c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?=
Date: Tue, 30 Jan 2024 19:44:55 +0100
Subject: [PATCH 184/884] Handle both fuzzer.log and fuzzer.log.zst
--- tests/ci/ast_fuzzer_check.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-)
diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py
index 41e4ef19361..95a887484f2 100644
--- a/tests/ci/ast_fuzzer_check.py
+++ b/tests/ci/ast_fuzzer_check.py
@@ -113,7 +113,6 @@ def main(): paths = { "run.log": run_log_path, "main.log": main_log_path, - "fuzzer.log": workspace_path / "fuzzer.log", "report.html": workspace_path / "report.html", "core.zst": workspace_path / "core.zst", "dmesg.log": workspace_path / "dmesg.log",
@@ -129,6 +128,14 @@ def main(): if not_compressed_server_log_path.exists(): paths["server.log"] = not_compressed_server_log_path + # Same idea but with the fuzzer log + compressed_fuzzer_log_path = workspace_path / "fuzzer.log.zst" + if compressed_fuzzer_log_path.exists(): + paths["fuzzer.log.zst"] = compressed_fuzzer_log_path + not_compressed_fuzzer_log_path = workspace_path / "fuzzer.log" + if not_compressed_fuzzer_log_path.exists(): + paths["fuzzer.log"] = not_compressed_fuzzer_log_path + # Try to get status 
message saved by the fuzzer try: with open(workspace_path / "status.txt", "r", encoding="utf-8") as status_f: From 4f0c78d66557bd74d21796ce2ea661132c26abc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ra=C3=BAl=20Mar=C3=ADn?= Date: Tue, 30 Jan 2024 20:25:26 +0100 Subject: [PATCH 185/884] Upload one file. Save the planet --- docker/test/fuzzer/run-fuzzer.sh | 4 ++-- tests/ci/ast_fuzzer_check.py | 18 +++++++++--------- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index 050d4b68628..ca6bff9c6be 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -389,8 +389,8 @@ fi rg --text -F '' server.log > fatal.log ||: dmesg -T > dmesg.log ||: -zstd --threads=0 server.log -zstd --threads=0 fuzzer.log +zstd --threads=0 --rm server.log +zstd --threads=0 --rm fuzzer.log cat > report.html < diff --git a/tests/ci/ast_fuzzer_check.py b/tests/ci/ast_fuzzer_check.py index 95a887484f2..26ce7f5140b 100644 --- a/tests/ci/ast_fuzzer_check.py +++ b/tests/ci/ast_fuzzer_check.py @@ -121,20 +121,20 @@ def main(): compressed_server_log_path = workspace_path / "server.log.zst" if compressed_server_log_path.exists(): paths["server.log.zst"] = compressed_server_log_path - - # The script can fail before the invocation of `zstd`, but we are still interested in its log: - - not_compressed_server_log_path = workspace_path / "server.log" - if not_compressed_server_log_path.exists(): - paths["server.log"] = not_compressed_server_log_path + else: + # The script can fail before the invocation of `zstd`, but we are still interested in its log: + not_compressed_server_log_path = workspace_path / "server.log" + if not_compressed_server_log_path.exists(): + paths["server.log"] = not_compressed_server_log_path # Same idea but with the fuzzer log compressed_fuzzer_log_path = workspace_path / "fuzzer.log.zst" if compressed_fuzzer_log_path.exists(): paths["fuzzer.log.zst"] = compressed_fuzzer_log_path - not_compressed_fuzzer_log_path = workspace_path / "fuzzer.log" - if not_compressed_fuzzer_log_path.exists(): - paths["fuzzer.log"] = not_compressed_fuzzer_log_path + else: + not_compressed_fuzzer_log_path = workspace_path / "fuzzer.log" + if not_compressed_fuzzer_log_path.exists(): + paths["fuzzer.log"] = not_compressed_fuzzer_log_path # Try to get status message saved by the fuzzer try: From 17ab2674f4c8ad7a09194659e0a0c86d4440f203 Mon Sep 17 00:00:00 2001 From: Nikita Taranov Date: Tue, 30 Jan 2024 20:35:10 +0100 Subject: [PATCH 186/884] impl --- src/Common/ElapsedTimeProfileEventIncrement.h | 3 +- src/Common/ProfileEvents.cpp | 7 +++ .../MergeTreeDataPartWriterOnDisk.cpp | 15 ++++++ .../MergeTree/MergeTreeDataPartWriterOnDisk.h | 2 + .../MergeTree/MergeTreeDataWriter.cpp | 47 ++++++++++++++----- 5 files changed, 60 insertions(+), 14 deletions(-) diff --git a/src/Common/ElapsedTimeProfileEventIncrement.h b/src/Common/ElapsedTimeProfileEventIncrement.h index b30afd24a4c..731295a4cfd 100644 --- a/src/Common/ElapsedTimeProfileEventIncrement.h +++ b/src/Common/ElapsedTimeProfileEventIncrement.h @@ -14,12 +14,13 @@ enum Time Seconds, }; -template